diff --git a/sys/contrib/openzfs/config/kernel-inode-times.m4 b/sys/contrib/openzfs/config/kernel-inode-times.m4
index 412e13b47df5..aae95abf1720 100644
--- a/sys/contrib/openzfs/config/kernel-inode-times.m4
+++ b/sys/contrib/openzfs/config/kernel-inode-times.m4
@@ -1,93 +1,93 @@
AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
	dnl #
	dnl # 5.6 API change
	dnl # timespec64_trunc() replaced by timestamp_truncate() interface.
	dnl #
	ZFS_LINUX_TEST_SRC([timestamp_truncate], [
		#include <linux/fs.h>
	],[
		struct timespec64 ts;
		struct inode ip;

		memset(&ts, 0, sizeof(ts));
		ts = timestamp_truncate(ts, &ip);
	])

	dnl #
	dnl # 4.18 API change
	dnl # i_atime, i_mtime, and i_ctime changed from timespec to timespec64.
	dnl #
	ZFS_LINUX_TEST_SRC([inode_times], [
		#include <linux/fs.h>
	],[
		struct inode ip;
		struct timespec ts;

		memset(&ip, 0, sizeof(ip));
		ts = ip.i_mtime;
	])

	dnl #
	dnl # 6.6 API change
	dnl # i_ctime no longer directly accessible, must use
	dnl # inode_get_ctime(ip), inode_set_ctime*(ip) to
	dnl # read/write.
	dnl #
	ZFS_LINUX_TEST_SRC([inode_get_ctime], [
		#include <linux/fs.h>
	],[
		struct inode ip;

		memset(&ip, 0, sizeof(ip));
		inode_get_ctime(&ip);
	])

	ZFS_LINUX_TEST_SRC([inode_set_ctime_to_ts], [
		#include <linux/fs.h>
	],[
		struct inode ip;
-		struct timespec64 ts;
+		struct timespec64 ts = {0};

		memset(&ip, 0, sizeof(ip));
		inode_set_ctime_to_ts(&ip, ts);
	])
])

AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
	AC_MSG_CHECKING([whether timestamp_truncate() exists])
	ZFS_LINUX_TEST_RESULT([timestamp_truncate], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_TIMESTAMP_TRUNCATE, 1,
		    [timestamp_truncate() exists])
	],[
		AC_MSG_RESULT(no)
	])

	AC_MSG_CHECKING([whether inode->i_*time's are timespec64])
	ZFS_LINUX_TEST_RESULT([inode_times], [
		AC_MSG_RESULT(no)
	],[
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_TIMESPEC64_TIMES, 1,
		    [inode->i_*time's are timespec64])
	])

	AC_MSG_CHECKING([whether inode_get_ctime() exists])
	ZFS_LINUX_TEST_RESULT([inode_get_ctime], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_GET_CTIME, 1,
		    [inode_get_ctime() exists in linux/fs.h])
	],[
		AC_MSG_RESULT(no)
	])

	AC_MSG_CHECKING([whether inode_set_ctime_to_ts() exists])
	ZFS_LINUX_TEST_RESULT([inode_set_ctime_to_ts], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_INODE_SET_CTIME_TO_TS, 1,
		    [inode_set_ctime_to_ts() exists in linux/fs.h])
	],[
		AC_MSG_RESULT(no)
	])
])

diff --git a/sys/contrib/openzfs/include/sys/dmu.h b/sys/contrib/openzfs/include/sys/dmu.h
index bddf395df7ee..921f51f27a20 100644
--- a/sys/contrib/openzfs/include/sys/dmu.h
+++ b/sys/contrib/openzfs/include/sys/dmu.h
@@ -1,1116 +1,1115 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct dsl_crypto_params; struct locked_range; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef struct dnode dnode_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_ENCRYPTED 0x20 #define DMU_OT_BYTESWAP_MASK 0x1f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). All of the types created by this method * are cached in the dbuf metadata cache. */ #define DMU_OT(byteswap, metadata, encrypted) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((encrypted) ? DMU_OT_ENCRYPTED : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) /* * MDB doesn't have dmu_ot; it defines these macros itself. */ #ifndef ZFS_MDB #define DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata) #define DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt) #define DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap) #endif #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ (((ot) & DMU_OT_METADATA) != 0) : \ DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) #define DMU_OT_IS_CRITICAL(ot) \ (DMU_OT_IS_METADATA(ot) && \ (ot) != DMU_OT_DNODE && \ (ot) != DMU_OT_DIRECTORY_CONTENTS && \ (ot) != DMU_OT_SA) /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ #define DMU_OT_IS_FILE(ot) \ ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) #define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ (((ot) & DMU_OT_ENCRYPTED) != 0) : \ DMU_OT_IS_ENCRYPTED_IMPL(ot)) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. 
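As a worked illustration of the encoding above: DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE) is 0x80 | 0x40 | 0x03 = 0xc3, which matches the DMU_OTN_UINT64_METADATA value defined later in this header, and the accessor macros recover each field again. A minimal sketch (the function name is invented; ASSERT/ASSERT3U are the usual ZFS debug macros):

static void
dmu_ot_encoding_example(void)
{
	/* 0x80 (NEWTYPE) | 0x40 (METADATA) | 0x03 (DMU_BSWAP_UINT64) == 0xc3 */
	dmu_object_type_t ot = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE);

	ASSERT(DMU_OT_IS_VALID(ot));		/* NEWTYPE set, byteswap < NUMFUNCS */
	ASSERT(DMU_OT_IS_METADATA(ot));		/* 0x40 bit is set */
	ASSERT(!DMU_OT_IS_ENCRYPTED(ot));	/* 0x20 bit is clear */
	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_UINT64);	/* low 5 bits */
}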
*/ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ DMU_OT_BYTESWAP_IMPL(ot)) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). 
*/ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE), DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE), DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE), DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE), DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE), DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE), DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE), DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE), DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE), DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE), DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE), } dmu_object_type_t; /* * These flags are intended to be used to specify the "txg_how" * parameter when calling the dmu_tx_assign() function. See the comment * above dmu_tx_assign() for more details on the meaning of these flags. */ #define TXG_NOWAIT (0ULL) #define TXG_WAIT (1ULL<<0) #define TXG_NOTHROTTLE (1ULL<<1) void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) #define DMU_PROJECTUSED_OBJECT (-3ULL) /* * Zap prefix for object accounting in DMU_{USER,GROUP,PROJECT}USED_OBJECT. */ #define DMU_OBJACCT_PREFIX "obj-" #define DMU_OBJACCT_PREFIX_LEN 4 /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. 
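As a rough usage sketch of these routines (the function name and dataset name are placeholders, and FTAG is the conventional hold tag used throughout ZFS):

static int
inspect_objset_example(const char *name)	/* e.g. "pool/fs" */
{
	objset_t *os;
	int error;

	error = dmu_objset_hold(name, FTAG, &os);
	if (error != 0)
		return (error);

	/* ... inspect the objset, e.g. dmu_objset_id(os) ... */

	dmu_objset_rele(os, FTAG);
	return (0);
}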
*/ typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int dmu_objset_hold(const char *name, const void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t key_required, const void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, const void *tag); void dmu_objset_disown(objset_t *os, boolean_t key_required, const void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func, void *arg); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_ERRORSCRUB "error_scrub" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" #define DMU_POOL_REMOVING "com.delphix:removing" #define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" #define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" #define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" #define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" #define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. 
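A hedged sketch of that allocation path (names invented; a bonus hold on DMU_NEW_OBJECT is assumed to cover the new dnode, and blocksize 0 is assumed to select the default):

static int
create_object_example(objset_t *os, uint64_t *objectp)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);	/* declare intent to create */

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}

	*objectp = dmu_object_alloc(os, DMU_OTN_UINT64_DATA, 0,
	    DMU_OT_NONE, 0, tx);
	dmu_tx_commit(tx);
	return (0);
}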
* * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, int dnodesize, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx); int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the number of levels on a dnode. nlevels must be greater than the * current number of levels or an EINVAL will be returned. */ int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx); /* * Set the data blocksize for an object. * * The object cannot have any blocks allocated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly * to accommodate the change. When calling this function, the caller must * ensure that the object's nlevels can sufficiently support the new maxblkid. */ int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx); /* * Set the checksum property on a dnode. 
The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_will_dirty() * before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release what you hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp); int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You must not access the dmu_buf_t after releasing * what you hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. 
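For example, a read-only peek at one block might look like the sketch below (function name is invented; a flags value of 0 is assumed to request the default read behaviour):

static int
peek_block_example(objset_t *os, uint64_t object, uint64_t offset)
{
	dmu_buf_t *db;
	int error;

	error = dmu_buf_hold(os, object, offset, FTAG, &db, 0);
	if (error != 0)
		return (error);

	/* db->db_data and db->db_size are valid only while the hold is kept */
	/* ... copy out whatever is needed ... */

	dmu_buf_rele(db, FTAG);
	return (0);
}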
*/ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags); int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags); int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, const void *tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, const void *tag); void dmu_buf_rele(dmu_buf_t *db, const void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); uint64_t dmu_buf_user_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, const void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). * * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. 
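A minimal sketch of the embedding pattern described above; the names are invented, and kmem_zalloc()/kmem_free()/KM_SLEEP are the usual SPL allocator interfaces, assumed here:

typedef struct my_dbuf_user {
	dmu_buf_user_t	mdu_dbu;	/* first member: allows the simple cast */
	uint64_t	mdu_cookie;
} my_dbuf_user_t;

static void
my_dbuf_user_evict_sync(void *arg)
{
	my_dbuf_user_t *mdu = arg;

	/* The dbuf may already be gone; touch only the private data. */
	kmem_free(mdu, sizeof (*mdu));
}

static void
attach_user_example(dmu_buf_t *db)
{
	my_dbuf_user_t *mdu = kmem_zalloc(sizeof (*mdu), KM_SLEEP);

	dmu_buf_init_user(&mdu->mdu_dbu, my_dbuf_user_evict_sync, NULL, NULL);
	if (dmu_buf_set_user(db, &mdu->mdu_dbu) != NULL)
		kmem_free(mdu, sizeof (*mdu));	/* another user got there first */
}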
* * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; /* Size of user data, for inclusion in dbuf_cache accounting. */ uint64_t dbu_size; /* * This instance's eviction function pointers. * * dbu_evict_func_sync is called synchronously and then * dbu_evict_func_async is executed asynchronously on a taskq. */ dmu_buf_evict_func_t *dbu_evict_func_sync; dmu_buf_evict_func_t *dbu_evict_func_async; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. */ static inline void dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp __maybe_unused) { ASSERT(dbu->dbu_evict_func_sync == NULL); ASSERT(dbu->dbu_evict_func_async == NULL); /* must have at least one evict func */ IMPLY(evict_func_sync == NULL, evict_func_async != NULL); dbu->dbu_evict_func_sync = evict_func_sync; dbu->dbu_evict_func_async = evict_func_async; taskq_init_ent(&dbu->dbu_tqent); #ifdef ZFS_DEBUG dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; #endif } /* * Attach user data to a dbuf and mark it for normal (when the dbuf's * data is cleared or its reference count goes to zero) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * Attach user data to a dbuf and mark it for immediate (its dirty and * reference counts are equal) eviction processing. * * Returns NULL on success, or the existing user if another user currently * owns the buffer. */ void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); /* * Replace the current user of a dbuf. * * If given the current user of a dbuf, replaces the dbuf's user with * "new_user" and returns the user data pointer that was replaced. * Otherwise returns the current, and unmodified, dbuf user pointer. */ void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); /* * Remove the specified user data for a DMU buffer. * * Returns the user that was removed on success, or the current user if * another user currently owns the buffer. */ void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); /* * User data size accounting. This can be used to artifically inflate the size * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough * to satisfy memory reclaim requests. It's not used for anything else, and * defaults to 0. */ uint64_t dmu_buf_user_size(dmu_buf_t *db); void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd); void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub); /* * Returns the user data (dmu_buf_user_t *) associated with this dbuf. 
*/ void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. 
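Putting those rules together, a simple write path could look like this sketch (names are invented; error handling reduced to the essentials):

static int
write_block_example(objset_t *os, uint64_t object, uint64_t offset,
    int length, const void *data)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	dmu_tx_hold_write(tx, object, offset, length);

	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);	/* never committed, so abort it */
		return (error);
	}

	/* Optional: dmu_tx_callback_register(tx, my_done_cb, my_arg); */
	dmu_write(os, object, offset, length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}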
* * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. * * When multiple callbacks are registered to the transaction, the callbacks * will be called in reverse order to let Lustre, the only user of commit * callback currently, take the fast path of its commit callback handling. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); void dmu_tx_do_callbacks(list_t *cb_list, int error); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ #define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size); int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx); #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf extern uint_t zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. 
*/ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_dnodesize; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void (*const arc_byteswap_func_t)(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; boolean_t ot_dbuf_metadata_cache; boolean_t ot_encrypt; const char *ot_name; } dmu_object_type_info_t; typedef const struct dmu_object_byteswap_info { arc_byteswap_func_t ob_func; const char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; uint8_t dds_redacted; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) 
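For instance, a filesystem consumer can gather the statvfs() inputs roughly like this (sketch; the function name is invented and the shift conversion is only indicative):

static void
fill_statvfs_example(objset_t *os)
{
	uint64_t refdbytes, availbytes, usedobjs, availobjs;

	dmu_objset_space(os, &refdbytes, &availbytes, &usedobjs, &availobjs);

	/* e.g. f_blocks ~ (refdbytes + availbytes) >> bshift, f_bfree ~ availbytes >> bshift */
}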
*/ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern uint64_t dmu_objset_dnodesize(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_objset_blksize(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); extern int dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef struct zfs_file_info { uint64_t zfi_user; uint64_t zfi_group; uint64_t zfi_project; uint64_t zfi_generation; } zfs_file_info_t; typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, struct zfs_file_info *zoi); extern void dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct lwb *zgd_lwb; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct blkptr *bps, size_t *nbpsp); int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, - boolean_t replay); + uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps); /* * Initial setup and final teardown. 
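dmu_read_l0_bps() and dmu_brt_clone() above form the block-cloning path, and this revision drops the trailing replay argument from dmu_brt_clone(). A hedged sketch of a caller (names invented; nbps is assumed to pass the array capacity in and return the populated count; tx is assumed to already hold the destination range, e.g. via dmu_tx_hold_clone_by_dnode()):

static int
clone_range_example(objset_t *srcos, uint64_t srcobj, objset_t *dstos,
    uint64_t dstobj, uint64_t offset, uint64_t length, dmu_tx_t *tx)
{
	struct blkptr bps[16];	/* full blkptr definition comes from elsewhere */
	size_t nbps = 16;
	int error;

	error = dmu_read_l0_bps(srcos, srcobj, offset, length, bps, &nbps);
	if (error != 0)
		return (error);

	return (dmu_brt_clone(dstos, dstobj, offset, length, tx, bps, nbps));
}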
*/ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, zfs_file_t *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern uint_t dmu_prefetch_max; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ diff --git a/sys/contrib/openzfs/lib/libspl/os/linux/zone.c b/sys/contrib/openzfs/lib/libspl/os/linux/zone.c index 622d04cbc14a..f8a10bfa167a 100644 --- a/sys/contrib/openzfs/lib/libspl/os/linux/zone.c +++ b/sys/contrib/openzfs/lib/libspl/os/linux/zone.c @@ -1,62 +1,62 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Ricardo Correia. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include zoneid_t getzoneid(void) { char path[PATH_MAX]; char buf[128] = { '\0' }; char *cp; int c = snprintf(path, sizeof (path), "/proc/self/ns/user"); /* This API doesn't have any error checking... */ if (c < 0 || c >= sizeof (path)) - return (0); + return (GLOBAL_ZONEID); ssize_t r = readlink(path, buf, sizeof (buf) - 1); if (r < 0) - return (0); + return (GLOBAL_ZONEID); cp = strchr(buf, '['); if (cp == NULL) - return (0); + return (GLOBAL_ZONEID); cp++; unsigned long n = strtoul(cp, NULL, 10); if (n == ULONG_MAX && errno == ERANGE) - return (0); + return (GLOBAL_ZONEID); zoneid_t z = (zoneid_t)n; return (z); } diff --git a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S index 0001e4d69055..190dbabc5ecb 100644 --- a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S +++ b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S @@ -1,2769 +1,2776 @@ /* * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * https://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
*/ /* * Portions Copyright (c) 2022 Tino Reichardt * - modified assembly to fit into OpenZFS */ #if defined(__arm__) -#define __ARM_ARCH__ 7 -#define __ARM_MAX_ARCH__ 7 +#ifndef __ARM_ARCH +# define __ARM_ARCH__ 7 +#else +# define __ARM_ARCH__ __ARM_ARCH +#endif #if defined(__thumb2__) .syntax unified .thumb #else .code 32 #endif .text .type K256,%object .align 5 K256: .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256,.-K256 .word 0 @ terminator .align 5 .globl zfs_sha256_block_armv7 .type zfs_sha256_block_armv7,%function zfs_sha256_block_armv7: .Lzfs_sha256_block_armv7: #if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ zfs_sha256_block_armv7 #else adr r3,.Lzfs_sha256_block_armv7 #endif add r2,r1,r2,lsl#6 @ len to point at the end of inp stmdb sp!,{r0,r1,r2,r4-r11,lr} ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} sub r14,r3,#256+32 @ K256 sub sp,sp,#16*4 @ alloca(X[16]) .Loop: # if __ARM_ARCH__>=7 ldr r2,[r1],#4 # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ magic eor r12,r12,r12 #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 0 # if 0==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 add r4,r4,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r8,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 0 add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 0==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r8,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#0*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 0==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 0<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#2*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#15*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 1 # if 1==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 add r11,r11,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r7,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 1 add r11,r11,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 1==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r7,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#1*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 1==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 1<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#3*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#0*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 2 # if 2==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 add r10,r10,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r6,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 2 add r10,r10,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 2==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r6,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#2*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 2==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 2<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#4*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#1*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 3 # if 3==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 add r9,r9,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r5,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 3 add r9,r9,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 3==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r5,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#3*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 3==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 3<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#5*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#2*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 4 # if 4==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 add r8,r8,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r4,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 4 add r8,r8,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 4==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r4,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#4*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 4==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 4<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#6*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#3*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 5 # if 5==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r11,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 5==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r11,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#5*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 5==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 5<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#7*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#4*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 6 # if 6==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 add r6,r6,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r10,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 6 add r6,r6,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 6==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r10,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#6*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 6==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 6<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#8*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#5*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 7 # if 7==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 add r5,r5,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r9,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 7==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r9,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#7*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 7==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 7<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#9*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#6*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 8 # if 8==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 add r4,r4,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r8,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 8 add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 8==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r8,r8,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r8,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#8*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 8==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 8<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#10*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#7*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 9 # if 9==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 add r11,r11,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r7,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 9 add r11,r11,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 9==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r7,r7,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r7,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#9*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 9==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 9<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#11*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#8*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 10 # if 10==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 add r10,r10,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r6,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 10 add r10,r10,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 10==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r6,r6,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r6,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#10*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 10==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 10<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#12*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#9*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 11 # if 11==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 add r9,r9,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r5,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 11 add r9,r9,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 11==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r5,r5,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r5,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#11*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 11==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 11<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#13*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#10*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 12 # if 12==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 add r8,r8,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r4,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 12 add r8,r8,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 12==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r4,r4,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r4,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#12*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 12==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 12<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#14*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#11*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 13 # if 13==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 add r7,r7,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r11,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 13 add r7,r7,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 13==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r11,r11,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r11,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#13*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 13==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 13<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#15*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#12*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 14 # if 14==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 add r6,r6,r12 @ h+=Maj(a,b,c) from the past eor r0,r0,r10,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 14 add r6,r6,r12 @ h+=Maj(a,b,c) from the past ldrb r12,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r12,lsl#8 ldrb r12,[r1],#4 orr r2,r2,r0,lsl#16 # if 14==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r10,r10,ror#5 orr r2,r2,r12,lsl#24 eor r0,r0,r10,ror#19 @ Sigma1(e) #endif ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#14*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 14==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 14<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#0*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#13*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) #if __ARM_ARCH__>=7 @ ldr r2,[r1],#4 @ 15 # if 15==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 add r5,r5,r3 @ h+=Maj(a,b,c) from the past eor r0,r0,r9,ror#19 @ Sigma1(e) # ifndef __ARMEB__ rev r2,r2 # endif #else @ ldrb r2,[r1,#3] @ 15 add r5,r5,r3 @ h+=Maj(a,b,c) from the past ldrb r3,[r1,#2] ldrb r0,[r1,#1] orr r2,r2,r3,lsl#8 ldrb r3,[r1],#4 orr r2,r2,r0,lsl#16 # if 15==15 str r1,[sp,#17*4] @ make room for r1 # endif eor r0,r9,r9,ror#5 orr r2,r2,r3,lsl#24 eor r0,r0,r9,ror#19 @ Sigma1(e) #endif ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#15*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 15==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 15<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#1*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#14*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) .Lrounds_16_xx: @ ldr r2,[sp,#1*4] @ 16 @ ldr r1,[sp,#14*4] mov r0,r2,ror#7 add r4,r4,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#0*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#9*4] add r12,r12,r0 eor r0,r8,r8,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r8,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#0*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 16==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 16<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#2*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#15*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#2*4] @ 17 @ ldr r1,[sp,#15*4] mov r0,r2,ror#7 add r11,r11,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#1*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#10*4] add r3,r3,r0 eor r0,r7,r7,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r7,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#1*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 17==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 17<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#3*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#0*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#3*4] @ 18 @ ldr r1,[sp,#0*4] mov r0,r2,ror#7 add r10,r10,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#2*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#11*4] add r12,r12,r0 eor r0,r6,r6,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r6,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#2*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 18==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 18<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#4*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#1*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#4*4] @ 19 @ ldr r1,[sp,#1*4] mov r0,r2,ror#7 add r9,r9,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#3*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#12*4] add r3,r3,r0 eor r0,r5,r5,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r5,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#3*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 19==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 19<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#5*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#2*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#5*4] @ 20 @ ldr r1,[sp,#2*4] mov r0,r2,ror#7 add r8,r8,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#4*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#13*4] add r12,r12,r0 eor r0,r4,r4,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r4,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#4*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 20==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 20<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#6*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#3*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#6*4] @ 21 @ ldr r1,[sp,#3*4] mov r0,r2,ror#7 add r7,r7,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#5*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#14*4] add r3,r3,r0 eor r0,r11,r11,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r11,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#5*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 21==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 21<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#7*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#4*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#7*4] @ 22 @ ldr r1,[sp,#4*4] mov r0,r2,ror#7 add r6,r6,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#6*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#15*4] add r12,r12,r0 eor r0,r10,r10,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r10,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#6*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 22==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 22<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#8*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#5*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#8*4] @ 23 @ ldr r1,[sp,#5*4] mov r0,r2,ror#7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#7*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#0*4] add r3,r3,r0 eor r0,r9,r9,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r9,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#7*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 23==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 23<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r5,r6 @ a^b, b^c in next round #else ldr r2,[sp,#9*4] @ from future BODY_16_xx eor r3,r5,r6 @ a^b, b^c in next round ldr r1,[sp,#6*4] @ from future BODY_16_xx #endif eor r0,r0,r5,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r8,r8,r4 @ d+=h eor r12,r12,r6 @ Maj(a,b,c) add r4,r4,r0,ror#2 @ h+=Sigma0(a) @ add r4,r4,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#9*4] @ 24 @ ldr r1,[sp,#6*4] mov r0,r2,ror#7 add r4,r4,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#8*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#1*4] add r12,r12,r0 eor r0,r8,r8,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r8,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r11,r11,r2 @ h+=X[i] str r2,[sp,#8*4] eor r2,r9,r10 add r11,r11,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r8 add r11,r11,r12 @ h+=K256[i] eor r2,r2,r10 @ Ch(e,f,g) eor r0,r4,r4,ror#11 add r11,r11,r2 @ h+=Ch(e,f,g) #if 24==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 24<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r4,r5 @ a^b, b^c in next round #else ldr r2,[sp,#10*4] @ from future BODY_16_xx eor r12,r4,r5 @ a^b, b^c in next round ldr r1,[sp,#7*4] @ from future BODY_16_xx #endif eor r0,r0,r4,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r7,r7,r11 @ d+=h eor r3,r3,r5 @ Maj(a,b,c) add r11,r11,r0,ror#2 @ h+=Sigma0(a) @ add r11,r11,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#10*4] @ 25 @ ldr r1,[sp,#7*4] mov r0,r2,ror#7 add r11,r11,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#9*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#2*4] add r3,r3,r0 eor r0,r7,r7,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r7,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r10,r10,r2 @ h+=X[i] str r2,[sp,#9*4] eor r2,r8,r9 add r10,r10,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r7 add r10,r10,r3 @ h+=K256[i] eor r2,r2,r9 @ Ch(e,f,g) eor r0,r11,r11,ror#11 add r10,r10,r2 @ h+=Ch(e,f,g) #if 25==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 25<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r11,r4 @ a^b, b^c in next round #else ldr r2,[sp,#11*4] @ from future BODY_16_xx eor r3,r11,r4 @ a^b, b^c in next round ldr r1,[sp,#8*4] @ from future BODY_16_xx #endif eor r0,r0,r11,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r6,r6,r10 @ d+=h eor r12,r12,r4 @ Maj(a,b,c) add r10,r10,r0,ror#2 @ h+=Sigma0(a) @ add r10,r10,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#11*4] @ 26 @ ldr r1,[sp,#8*4] mov r0,r2,ror#7 add r10,r10,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#10*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#3*4] add r12,r12,r0 eor r0,r6,r6,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r6,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r9,r9,r2 @ h+=X[i] str r2,[sp,#10*4] eor r2,r7,r8 add r9,r9,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r6 add r9,r9,r12 @ h+=K256[i] eor r2,r2,r8 @ Ch(e,f,g) eor r0,r10,r10,ror#11 add r9,r9,r2 @ h+=Ch(e,f,g) #if 26==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? 
#endif #if 26<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r10,r11 @ a^b, b^c in next round #else ldr r2,[sp,#12*4] @ from future BODY_16_xx eor r12,r10,r11 @ a^b, b^c in next round ldr r1,[sp,#9*4] @ from future BODY_16_xx #endif eor r0,r0,r10,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r5,r5,r9 @ d+=h eor r3,r3,r11 @ Maj(a,b,c) add r9,r9,r0,ror#2 @ h+=Sigma0(a) @ add r9,r9,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#12*4] @ 27 @ ldr r1,[sp,#9*4] mov r0,r2,ror#7 add r9,r9,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#11*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#4*4] add r3,r3,r0 eor r0,r5,r5,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r5,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r8,r8,r2 @ h+=X[i] str r2,[sp,#11*4] eor r2,r6,r7 add r8,r8,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r5 add r8,r8,r3 @ h+=K256[i] eor r2,r2,r7 @ Ch(e,f,g) eor r0,r9,r9,ror#11 add r8,r8,r2 @ h+=Ch(e,f,g) #if 27==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? #endif #if 27<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r9,r10 @ a^b, b^c in next round #else ldr r2,[sp,#13*4] @ from future BODY_16_xx eor r3,r9,r10 @ a^b, b^c in next round ldr r1,[sp,#10*4] @ from future BODY_16_xx #endif eor r0,r0,r9,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r4,r4,r8 @ d+=h eor r12,r12,r10 @ Maj(a,b,c) add r8,r8,r0,ror#2 @ h+=Sigma0(a) @ add r8,r8,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#13*4] @ 28 @ ldr r1,[sp,#10*4] mov r0,r2,ror#7 add r8,r8,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#12*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#5*4] add r12,r12,r0 eor r0,r4,r4,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r4,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r7,r7,r2 @ h+=X[i] str r2,[sp,#12*4] eor r2,r5,r6 add r7,r7,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r4 add r7,r7,r12 @ h+=K256[i] eor r2,r2,r6 @ Ch(e,f,g) eor r0,r8,r8,ror#11 add r7,r7,r2 @ h+=Ch(e,f,g) #if 28==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 28<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r8,r9 @ a^b, b^c in next round #else ldr r2,[sp,#14*4] @ from future BODY_16_xx eor r12,r8,r9 @ a^b, b^c in next round ldr r1,[sp,#11*4] @ from future BODY_16_xx #endif eor r0,r0,r8,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r11,r11,r7 @ d+=h eor r3,r3,r9 @ Maj(a,b,c) add r7,r7,r0,ror#2 @ h+=Sigma0(a) @ add r7,r7,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#14*4] @ 29 @ ldr r1,[sp,#11*4] mov r0,r2,ror#7 add r7,r7,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#13*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#6*4] add r3,r3,r0 eor r0,r11,r11,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r11,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r6,r6,r2 @ h+=X[i] str r2,[sp,#13*4] eor r2,r4,r5 add r6,r6,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r11 add r6,r6,r3 @ h+=K256[i] eor r2,r2,r5 @ Ch(e,f,g) eor r0,r7,r7,ror#11 add r6,r6,r2 @ h+=Ch(e,f,g) #if 29==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif #if 29<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r3,r7,r8 @ a^b, b^c in next round #else ldr r2,[sp,#15*4] @ from future BODY_16_xx eor r3,r7,r8 @ a^b, b^c in next round ldr r1,[sp,#12*4] @ from future BODY_16_xx #endif eor r0,r0,r7,ror#20 @ Sigma0(a) and r12,r12,r3 @ (b^c)&=(a^b) add r10,r10,r6 @ d+=h eor r12,r12,r8 @ Maj(a,b,c) add r6,r6,r0,ror#2 @ h+=Sigma0(a) @ add r6,r6,r12 @ h+=Maj(a,b,c) @ ldr r2,[sp,#15*4] @ 30 @ ldr r1,[sp,#12*4] mov r0,r2,ror#7 add r6,r6,r12 @ h+=Maj(a,b,c) from the past mov r12,r1,ror#17 eor r0,r0,r2,ror#18 eor r12,r12,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#14*4] eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#7*4] add r12,r12,r0 eor r0,r10,r10,ror#5 @ from BODY_00_15 add r2,r2,r12 eor r0,r0,r10,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r12,[r14],#4 @ *K256++ add r5,r5,r2 @ h+=X[i] str r2,[sp,#14*4] eor r2,r11,r4 add r5,r5,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r10 add r5,r5,r12 @ h+=K256[i] eor r2,r2,r4 @ Ch(e,f,g) eor r0,r6,r6,ror#11 add r5,r5,r2 @ h+=Ch(e,f,g) #if 30==31 and r12,r12,#0xff cmp r12,#0xf2 @ done? #endif #if 30<15 # if __ARM_ARCH__>=7 ldr r2,[r1],#4 @ prefetch # else ldrb r2,[r1,#3] # endif eor r12,r6,r7 @ a^b, b^c in next round #else ldr r2,[sp,#0*4] @ from future BODY_16_xx eor r12,r6,r7 @ a^b, b^c in next round ldr r1,[sp,#13*4] @ from future BODY_16_xx #endif eor r0,r0,r6,ror#20 @ Sigma0(a) and r3,r3,r12 @ (b^c)&=(a^b) add r9,r9,r5 @ d+=h eor r3,r3,r7 @ Maj(a,b,c) add r5,r5,r0,ror#2 @ h+=Sigma0(a) @ add r5,r5,r3 @ h+=Maj(a,b,c) @ ldr r2,[sp,#0*4] @ 31 @ ldr r1,[sp,#13*4] mov r0,r2,ror#7 add r5,r5,r3 @ h+=Maj(a,b,c) from the past mov r3,r1,ror#17 eor r0,r0,r2,ror#18 eor r3,r3,r1,ror#19 eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) ldr r2,[sp,#15*4] eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) ldr r1,[sp,#8*4] add r3,r3,r0 eor r0,r9,r9,ror#5 @ from BODY_00_15 add r2,r2,r3 eor r0,r0,r9,ror#19 @ Sigma1(e) add r2,r2,r1 @ X[i] ldr r3,[r14],#4 @ *K256++ add r4,r4,r2 @ h+=X[i] str r2,[sp,#15*4] eor r2,r10,r11 add r4,r4,r0,ror#6 @ h+=Sigma1(e) and r2,r2,r9 add r4,r4,r3 @ h+=K256[i] eor r2,r2,r11 @ Ch(e,f,g) eor r0,r5,r5,ror#11 add r4,r4,r2 @ h+=Ch(e,f,g) #if 31==31 and r3,r3,#0xff cmp r3,#0xf2 @ done? 
#endif
#if 31<15
# if __ARM_ARCH__>=7
	ldr	r2,[r1],#4			@ prefetch
# else
	ldrb	r2,[r1,#3]
# endif
	eor	r3,r5,r6			@ a^b, b^c in next round
#else
	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
	eor	r3,r5,r6			@ a^b, b^c in next round
	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
#endif
	eor	r0,r0,r5,ror#20	@ Sigma0(a)
	and	r12,r12,r3			@ (b^c)&=(a^b)
	add	r8,r8,r4			@ d+=h
	eor	r12,r12,r6			@ Maj(a,b,c)
	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
#ifdef __thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	r3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx
	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
	ldr	r0,[r3,#0]
	ldr	r2,[r3,#4]
	ldr	r12,[r3,#8]
	add	r4,r4,r0
	ldr	r0,[r3,#12]
	add	r5,r5,r2
	ldr	r2,[r3,#16]
	add	r6,r6,r12
	ldr	r12,[r3,#20]
	add	r7,r7,r0
	ldr	r0,[r3,#24]
	add	r8,r8,r2
	ldr	r2,[r3,#28]
	add	r9,r9,r12
	ldr	r1,[sp,#17*4]		@ pull inp
	ldr	r12,[sp,#18*4]		@ pull inp+len
	add	r10,r10,r0
	add	r11,r11,r2
	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
	cmp	r1,r12
	sub	r14,r14,#256	@ rewind Ktbl
	bne	.Loop
	add	sp,sp,#19*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	zfs_sha256_block_armv7,.-zfs_sha256_block_armv7
.arch	armv7-a
.fpu	neon
.globl	zfs_sha256_block_neon
.type	zfs_sha256_block_neon,%function
.align	5
.skip	16
zfs_sha256_block_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}
	sub	r11,sp,#16*4+16
+#if __ARM_ARCH__ >=7
	adr	r14,K256
+#else
+	ldr	r14,=K256
+#endif
	bic	r11,r11,#15		@ align for 128-bit stores
	mov	r12,sp
	mov	sp,r11			@ alloca
	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
	vld1.8	{q0},[r1]!
	vld1.8	{q1},[r1]!
	vld1.8	{q2},[r1]!
	vld1.8	{q3},[r1]!
	vld1.32	{q8},[r14,:128]!
	vld1.32	{q9},[r14,:128]!
	vld1.32	{q10},[r14,:128]!
	vld1.32	{q11},[r14,:128]!
	vrev32.8	q0,q0		@ yes, even on
	str	r0,[sp,#64]
	vrev32.8	q1,q1		@ big-endian
	str	r1,[sp,#68]
	mov	r1,sp
	vrev32.8	q2,q2
	str	r2,[sp,#72]
	vrev32.8	q3,q3
	str	r12,[sp,#76]		@ save original sp
	vadd.i32	q8,q8,q0
	vadd.i32	q9,q9,q1
	vst1.32	{q8},[r1,:128]!
	vadd.i32	q10,q10,q2
	vst1.32	{q9},[r1,:128]!
	vadd.i32	q11,q11,q3
	vst1.32	{q10},[r1,:128]!
	vst1.32	{q11},[r1,:128]!
ldmia r0,{r4-r11} sub r1,r1,#64 ldr r2,[sp,#0] eor r12,r12,r12 eor r3,r5,r6 b .L_00_48 .align 4 .L_00_48: vext.8 q8,q0,q1,#4 add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 vext.8 q9,q2,q3,#4 add r4,r4,r12 and r2,r2,r8 eor r12,r0,r8,ror#19 vshr.u32 q10,q8,#7 eor r0,r4,r4,ror#11 eor r2,r2,r10 vadd.i32 q0,q0,q9 add r11,r11,r12,ror#6 eor r12,r4,r5 vshr.u32 q9,q8,#3 eor r0,r0,r4,ror#20 add r11,r11,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#4] and r3,r3,r12 vshr.u32 q11,q8,#18 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 veor q9,q9,q10 add r10,r10,r2 vsli.32 q11,q8,#14 eor r2,r8,r9 eor r0,r7,r7,ror#5 vshr.u32 d24,d7,#17 add r11,r11,r3 and r2,r2,r7 veor q9,q9,q11 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 vsli.32 d24,d7,#15 eor r2,r2,r9 add r10,r10,r3,ror#6 vshr.u32 d25,d7,#10 eor r3,r11,r4 eor r0,r0,r11,ror#20 vadd.i32 q0,q0,q9 add r10,r10,r2 ldr r2,[sp,#8] veor d25,d25,d24 and r12,r12,r3 add r6,r6,r10 vshr.u32 d24,d7,#19 add r10,r10,r0,ror#2 eor r12,r12,r4 vsli.32 d24,d7,#13 add r9,r9,r2 eor r2,r7,r8 veor d25,d25,d24 eor r0,r6,r6,ror#5 add r10,r10,r12 vadd.i32 d0,d0,d25 and r2,r2,r6 eor r12,r0,r6,ror#19 vshr.u32 d24,d0,#17 eor r0,r10,r10,ror#11 eor r2,r2,r8 vsli.32 d24,d0,#15 add r9,r9,r12,ror#6 eor r12,r10,r11 vshr.u32 d25,d0,#10 eor r0,r0,r10,ror#20 add r9,r9,r2 veor d25,d25,d24 ldr r2,[sp,#12] and r3,r3,r12 vshr.u32 d24,d0,#19 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 vld1.32 {q8},[r14,:128]! add r8,r8,r2 vsli.32 d24,d0,#13 eor r2,r6,r7 eor r0,r5,r5,ror#5 veor d25,d25,d24 add r9,r9,r3 and r2,r2,r5 vadd.i32 d1,d1,d25 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 vadd.i32 q8,q8,q0 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#16] and r12,r12,r3 add r4,r4,r8 vst1.32 {q8},[r1,:128]! add r8,r8,r0,ror#2 eor r12,r12,r10 vext.8 q8,q1,q2,#4 add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 vext.8 q9,q3,q0,#4 add r8,r8,r12 and r2,r2,r4 eor r12,r0,r4,ror#19 vshr.u32 q10,q8,#7 eor r0,r8,r8,ror#11 eor r2,r2,r6 vadd.i32 q1,q1,q9 add r7,r7,r12,ror#6 eor r12,r8,r9 vshr.u32 q9,q8,#3 eor r0,r0,r8,ror#20 add r7,r7,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#20] and r3,r3,r12 vshr.u32 q11,q8,#18 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 veor q9,q9,q10 add r6,r6,r2 vsli.32 q11,q8,#14 eor r2,r4,r5 eor r0,r11,r11,ror#5 vshr.u32 d24,d1,#17 add r7,r7,r3 and r2,r2,r11 veor q9,q9,q11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 vsli.32 d24,d1,#15 eor r2,r2,r5 add r6,r6,r3,ror#6 vshr.u32 d25,d1,#10 eor r3,r7,r8 eor r0,r0,r7,ror#20 vadd.i32 q1,q1,q9 add r6,r6,r2 ldr r2,[sp,#24] veor d25,d25,d24 and r12,r12,r3 add r10,r10,r6 vshr.u32 d24,d1,#19 add r6,r6,r0,ror#2 eor r12,r12,r8 vsli.32 d24,d1,#13 add r5,r5,r2 eor r2,r11,r4 veor d25,d25,d24 eor r0,r10,r10,ror#5 add r6,r6,r12 vadd.i32 d2,d2,d25 and r2,r2,r10 eor r12,r0,r10,ror#19 vshr.u32 d24,d2,#17 eor r0,r6,r6,ror#11 eor r2,r2,r4 vsli.32 d24,d2,#15 add r5,r5,r12,ror#6 eor r12,r6,r7 vshr.u32 d25,d2,#10 eor r0,r0,r6,ror#20 add r5,r5,r2 veor d25,d25,d24 ldr r2,[sp,#28] and r3,r3,r12 vshr.u32 d24,d2,#19 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 vld1.32 {q8},[r14,:128]! add r4,r4,r2 vsli.32 d24,d2,#13 eor r2,r10,r11 eor r0,r9,r9,ror#5 veor d25,d25,d24 add r5,r5,r3 and r2,r2,r9 vadd.i32 d3,d3,d25 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 vadd.i32 q8,q8,q1 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#32] and r12,r12,r3 add r8,r8,r4 vst1.32 {q8},[r1,:128]! 
add r4,r4,r0,ror#2 eor r12,r12,r6 vext.8 q8,q2,q3,#4 add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 vext.8 q9,q0,q1,#4 add r4,r4,r12 and r2,r2,r8 eor r12,r0,r8,ror#19 vshr.u32 q10,q8,#7 eor r0,r4,r4,ror#11 eor r2,r2,r10 vadd.i32 q2,q2,q9 add r11,r11,r12,ror#6 eor r12,r4,r5 vshr.u32 q9,q8,#3 eor r0,r0,r4,ror#20 add r11,r11,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#36] and r3,r3,r12 vshr.u32 q11,q8,#18 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 veor q9,q9,q10 add r10,r10,r2 vsli.32 q11,q8,#14 eor r2,r8,r9 eor r0,r7,r7,ror#5 vshr.u32 d24,d3,#17 add r11,r11,r3 and r2,r2,r7 veor q9,q9,q11 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 vsli.32 d24,d3,#15 eor r2,r2,r9 add r10,r10,r3,ror#6 vshr.u32 d25,d3,#10 eor r3,r11,r4 eor r0,r0,r11,ror#20 vadd.i32 q2,q2,q9 add r10,r10,r2 ldr r2,[sp,#40] veor d25,d25,d24 and r12,r12,r3 add r6,r6,r10 vshr.u32 d24,d3,#19 add r10,r10,r0,ror#2 eor r12,r12,r4 vsli.32 d24,d3,#13 add r9,r9,r2 eor r2,r7,r8 veor d25,d25,d24 eor r0,r6,r6,ror#5 add r10,r10,r12 vadd.i32 d4,d4,d25 and r2,r2,r6 eor r12,r0,r6,ror#19 vshr.u32 d24,d4,#17 eor r0,r10,r10,ror#11 eor r2,r2,r8 vsli.32 d24,d4,#15 add r9,r9,r12,ror#6 eor r12,r10,r11 vshr.u32 d25,d4,#10 eor r0,r0,r10,ror#20 add r9,r9,r2 veor d25,d25,d24 ldr r2,[sp,#44] and r3,r3,r12 vshr.u32 d24,d4,#19 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 vld1.32 {q8},[r14,:128]! add r8,r8,r2 vsli.32 d24,d4,#13 eor r2,r6,r7 eor r0,r5,r5,ror#5 veor d25,d25,d24 add r9,r9,r3 and r2,r2,r5 vadd.i32 d5,d5,d25 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 vadd.i32 q8,q8,q2 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#48] and r12,r12,r3 add r4,r4,r8 vst1.32 {q8},[r1,:128]! add r8,r8,r0,ror#2 eor r12,r12,r10 vext.8 q8,q3,q0,#4 add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 vext.8 q9,q1,q2,#4 add r8,r8,r12 and r2,r2,r4 eor r12,r0,r4,ror#19 vshr.u32 q10,q8,#7 eor r0,r8,r8,ror#11 eor r2,r2,r6 vadd.i32 q3,q3,q9 add r7,r7,r12,ror#6 eor r12,r8,r9 vshr.u32 q9,q8,#3 eor r0,r0,r8,ror#20 add r7,r7,r2 vsli.32 q10,q8,#25 ldr r2,[sp,#52] and r3,r3,r12 vshr.u32 q11,q8,#18 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 veor q9,q9,q10 add r6,r6,r2 vsli.32 q11,q8,#14 eor r2,r4,r5 eor r0,r11,r11,ror#5 vshr.u32 d24,d5,#17 add r7,r7,r3 and r2,r2,r11 veor q9,q9,q11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 vsli.32 d24,d5,#15 eor r2,r2,r5 add r6,r6,r3,ror#6 vshr.u32 d25,d5,#10 eor r3,r7,r8 eor r0,r0,r7,ror#20 vadd.i32 q3,q3,q9 add r6,r6,r2 ldr r2,[sp,#56] veor d25,d25,d24 and r12,r12,r3 add r10,r10,r6 vshr.u32 d24,d5,#19 add r6,r6,r0,ror#2 eor r12,r12,r8 vsli.32 d24,d5,#13 add r5,r5,r2 eor r2,r11,r4 veor d25,d25,d24 eor r0,r10,r10,ror#5 add r6,r6,r12 vadd.i32 d6,d6,d25 and r2,r2,r10 eor r12,r0,r10,ror#19 vshr.u32 d24,d6,#17 eor r0,r6,r6,ror#11 eor r2,r2,r4 vsli.32 d24,d6,#15 add r5,r5,r12,ror#6 eor r12,r6,r7 vshr.u32 d25,d6,#10 eor r0,r0,r6,ror#20 add r5,r5,r2 veor d25,d25,d24 ldr r2,[sp,#60] and r3,r3,r12 vshr.u32 d24,d6,#19 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 vld1.32 {q8},[r14,:128]! add r4,r4,r2 vsli.32 d24,d6,#13 eor r2,r10,r11 eor r0,r9,r9,ror#5 veor d25,d25,d24 add r5,r5,r3 and r2,r2,r9 vadd.i32 d7,d7,d25 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 vadd.i32 q8,q8,q3 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[r14] and r12,r12,r3 add r8,r8,r4 vst1.32 {q8},[r1,:128]! 
add r4,r4,r0,ror#2 eor r12,r12,r6 teq r2,#0 @ check for K256 terminator ldr r2,[sp,#0] sub r1,r1,#64 bne .L_00_48 ldr r1,[sp,#68] ldr r0,[sp,#72] sub r14,r14,#256 @ rewind r14 teq r1,r0 it eq subeq r1,r1,#64 @ avoid SEGV vld1.8 {q0},[r1]! @ load next input block vld1.8 {q1},[r1]! vld1.8 {q2},[r1]! vld1.8 {q3},[r1]! it ne strne r1,[sp,#68] mov r1,sp add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 add r4,r4,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r8 eor r12,r0,r8,ror#19 eor r0,r4,r4,ror#11 eor r2,r2,r10 vrev32.8 q0,q0 add r11,r11,r12,ror#6 eor r12,r4,r5 eor r0,r0,r4,ror#20 add r11,r11,r2 vadd.i32 q8,q8,q0 ldr r2,[sp,#4] and r3,r3,r12 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 add r10,r10,r2 eor r2,r8,r9 eor r0,r7,r7,ror#5 add r11,r11,r3 and r2,r2,r7 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 eor r2,r2,r9 add r10,r10,r3,ror#6 eor r3,r11,r4 eor r0,r0,r11,ror#20 add r10,r10,r2 ldr r2,[sp,#8] and r12,r12,r3 add r6,r6,r10 add r10,r10,r0,ror#2 eor r12,r12,r4 add r9,r9,r2 eor r2,r7,r8 eor r0,r6,r6,ror#5 add r10,r10,r12 and r2,r2,r6 eor r12,r0,r6,ror#19 eor r0,r10,r10,ror#11 eor r2,r2,r8 add r9,r9,r12,ror#6 eor r12,r10,r11 eor r0,r0,r10,ror#20 add r9,r9,r2 ldr r2,[sp,#12] and r3,r3,r12 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 add r8,r8,r2 eor r2,r6,r7 eor r0,r5,r5,ror#5 add r9,r9,r3 and r2,r2,r5 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#16] and r12,r12,r3 add r4,r4,r8 add r8,r8,r0,ror#2 eor r12,r12,r10 vst1.32 {q8},[r1,:128]! add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 add r8,r8,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r4 eor r12,r0,r4,ror#19 eor r0,r8,r8,ror#11 eor r2,r2,r6 vrev32.8 q1,q1 add r7,r7,r12,ror#6 eor r12,r8,r9 eor r0,r0,r8,ror#20 add r7,r7,r2 vadd.i32 q8,q8,q1 ldr r2,[sp,#20] and r3,r3,r12 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 add r6,r6,r2 eor r2,r4,r5 eor r0,r11,r11,ror#5 add r7,r7,r3 and r2,r2,r11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 eor r2,r2,r5 add r6,r6,r3,ror#6 eor r3,r7,r8 eor r0,r0,r7,ror#20 add r6,r6,r2 ldr r2,[sp,#24] and r12,r12,r3 add r10,r10,r6 add r6,r6,r0,ror#2 eor r12,r12,r8 add r5,r5,r2 eor r2,r11,r4 eor r0,r10,r10,ror#5 add r6,r6,r12 and r2,r2,r10 eor r12,r0,r10,ror#19 eor r0,r6,r6,ror#11 eor r2,r2,r4 add r5,r5,r12,ror#6 eor r12,r6,r7 eor r0,r0,r6,ror#20 add r5,r5,r2 ldr r2,[sp,#28] and r3,r3,r12 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 add r4,r4,r2 eor r2,r10,r11 eor r0,r9,r9,ror#5 add r5,r5,r3 and r2,r2,r9 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#32] and r12,r12,r3 add r8,r8,r4 add r4,r4,r0,ror#2 eor r12,r12,r6 vst1.32 {q8},[r1,:128]! add r11,r11,r2 eor r2,r9,r10 eor r0,r8,r8,ror#5 add r4,r4,r12 vld1.32 {q8},[r14,:128]! 
and r2,r2,r8 eor r12,r0,r8,ror#19 eor r0,r4,r4,ror#11 eor r2,r2,r10 vrev32.8 q2,q2 add r11,r11,r12,ror#6 eor r12,r4,r5 eor r0,r0,r4,ror#20 add r11,r11,r2 vadd.i32 q8,q8,q2 ldr r2,[sp,#36] and r3,r3,r12 add r7,r7,r11 add r11,r11,r0,ror#2 eor r3,r3,r5 add r10,r10,r2 eor r2,r8,r9 eor r0,r7,r7,ror#5 add r11,r11,r3 and r2,r2,r7 eor r3,r0,r7,ror#19 eor r0,r11,r11,ror#11 eor r2,r2,r9 add r10,r10,r3,ror#6 eor r3,r11,r4 eor r0,r0,r11,ror#20 add r10,r10,r2 ldr r2,[sp,#40] and r12,r12,r3 add r6,r6,r10 add r10,r10,r0,ror#2 eor r12,r12,r4 add r9,r9,r2 eor r2,r7,r8 eor r0,r6,r6,ror#5 add r10,r10,r12 and r2,r2,r6 eor r12,r0,r6,ror#19 eor r0,r10,r10,ror#11 eor r2,r2,r8 add r9,r9,r12,ror#6 eor r12,r10,r11 eor r0,r0,r10,ror#20 add r9,r9,r2 ldr r2,[sp,#44] and r3,r3,r12 add r5,r5,r9 add r9,r9,r0,ror#2 eor r3,r3,r11 add r8,r8,r2 eor r2,r6,r7 eor r0,r5,r5,ror#5 add r9,r9,r3 and r2,r2,r5 eor r3,r0,r5,ror#19 eor r0,r9,r9,ror#11 eor r2,r2,r7 add r8,r8,r3,ror#6 eor r3,r9,r10 eor r0,r0,r9,ror#20 add r8,r8,r2 ldr r2,[sp,#48] and r12,r12,r3 add r4,r4,r8 add r8,r8,r0,ror#2 eor r12,r12,r10 vst1.32 {q8},[r1,:128]! add r7,r7,r2 eor r2,r5,r6 eor r0,r4,r4,ror#5 add r8,r8,r12 vld1.32 {q8},[r14,:128]! and r2,r2,r4 eor r12,r0,r4,ror#19 eor r0,r8,r8,ror#11 eor r2,r2,r6 vrev32.8 q3,q3 add r7,r7,r12,ror#6 eor r12,r8,r9 eor r0,r0,r8,ror#20 add r7,r7,r2 vadd.i32 q8,q8,q3 ldr r2,[sp,#52] and r3,r3,r12 add r11,r11,r7 add r7,r7,r0,ror#2 eor r3,r3,r9 add r6,r6,r2 eor r2,r4,r5 eor r0,r11,r11,ror#5 add r7,r7,r3 and r2,r2,r11 eor r3,r0,r11,ror#19 eor r0,r7,r7,ror#11 eor r2,r2,r5 add r6,r6,r3,ror#6 eor r3,r7,r8 eor r0,r0,r7,ror#20 add r6,r6,r2 ldr r2,[sp,#56] and r12,r12,r3 add r10,r10,r6 add r6,r6,r0,ror#2 eor r12,r12,r8 add r5,r5,r2 eor r2,r11,r4 eor r0,r10,r10,ror#5 add r6,r6,r12 and r2,r2,r10 eor r12,r0,r10,ror#19 eor r0,r6,r6,ror#11 eor r2,r2,r4 add r5,r5,r12,ror#6 eor r12,r6,r7 eor r0,r0,r6,ror#20 add r5,r5,r2 ldr r2,[sp,#60] and r3,r3,r12 add r9,r9,r5 add r5,r5,r0,ror#2 eor r3,r3,r7 add r4,r4,r2 eor r2,r10,r11 eor r0,r9,r9,ror#5 add r5,r5,r3 and r2,r2,r9 eor r3,r0,r9,ror#19 eor r0,r5,r5,ror#11 eor r2,r2,r11 add r4,r4,r3,ror#6 eor r3,r5,r6 eor r0,r0,r5,ror#20 add r4,r4,r2 ldr r2,[sp,#64] and r12,r12,r3 add r8,r8,r4 add r4,r4,r0,ror#2 eor r12,r12,r6 vst1.32 {q8},[r1,:128]! ldr r0,[r2,#0] add r4,r4,r12 @ h+=Maj(a,b,c) from the past ldr r12,[r2,#4] ldr r3,[r2,#8] ldr r1,[r2,#12] add r4,r4,r0 @ accumulate ldr r0,[r2,#16] add r5,r5,r12 ldr r12,[r2,#20] add r6,r6,r3 ldr r3,[r2,#24] add r7,r7,r1 ldr r1,[r2,#28] add r8,r8,r0 str r4,[r2],#4 add r9,r9,r12 str r5,[r2],#4 add r10,r10,r3 str r6,[r2],#4 add r11,r11,r1 str r7,[r2],#4 stmia r2,{r8-r11} ittte ne movne r1,sp ldrne r2,[sp,#0] eorne r12,r12,r12 ldreq sp,[sp,#76] @ restore original sp itt ne eorne r3,r5,r6 bne .L_00_48 ldmia sp!,{r4-r12,pc} .size zfs_sha256_block_neon,.-zfs_sha256_block_neon # if defined(__thumb2__) # define INST(a,b,c,d) .byte c,d|0xc,a,b # else # define INST(a,b,c,d) .byte a,b,c,d # endif .globl zfs_sha256_block_armv8 .type zfs_sha256_block_armv8,%function .align 5 zfs_sha256_block_armv8: .LARMv8: vld1.32 {q0,q1},[r0] sub r3,r3,#256+32 add r2,r1,r2,lsl#6 @ len to point at the end of inp b .Loop_v8 .align 4 .Loop_v8: vld1.8 {q8-q9},[r1]! vld1.8 {q10-q11},[r1]! vld1.32 {q12},[r3]! vrev32.8 q8,q8 vrev32.8 q9,q9 vrev32.8 q10,q10 vrev32.8 q11,q11 vmov q14,q0 @ offload vmov q15,q1 teq r1,r2 vld1.32 {q13},[r3]! 
vadd.i32 q12,q12,q8 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q9 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q9 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q9 INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q10 INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 vld1.32 {q12},[r3]! vadd.i32 q13,q13,q11 INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 vmov q2,q0 INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 vld1.32 {q13},[r3]! vadd.i32 q12,q12,q8 vmov q2,q0 INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 vld1.32 {q12},[r3]! 
	vadd.i32	q13,q13,q9
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	vld1.32	{q13},[r3]
	vadd.i32	q12,q12,q10
	sub	r3,r3,#256-16	@ rewind
	vmov	q2,q0
	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
	vadd.i32	q13,q13,q11
	vmov	q2,q0
	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
	vadd.i32	q0,q0,q14
	vadd.i32	q1,q1,q15
	it	ne
	bne	.Loop_v8
	vst1.32	{q0,q1},[r0]
	bx	lr		@ bx lr
.size	zfs_sha256_block_armv8,.-zfs_sha256_block_armv8
#endif
diff --git a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
index a4c804033b92..499cb6df9567 100644
--- a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
+++ b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
@@ -1,1822 +1,1825 @@
/*
 * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * Portions Copyright (c) 2022 Tino Reichardt
 * - modified assembly to fit into OpenZFS
 */
#if defined(__arm__)
-#define __ARM_ARCH__ 7
-#define __ARM_MAX_ARCH__ 7
+#ifndef __ARM_ARCH
+# define __ARM_ARCH__ 7
+#else
+# define __ARM_ARCH__ __ARM_ARCH
+#endif
#ifndef __KERNEL__
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif
#if defined(__thumb2__)
.syntax	unified
.thumb
# define adrl adr
#else
.code	32
#endif
.text
.type	K512,%object
.align	5
K512:
	WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
	WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
	WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
	WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
	WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
	WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
	WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
	WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
	WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
	WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
	WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
	WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
	WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
	WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
	WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
	WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
	WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
	WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
	WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
	WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
	WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
	WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
	WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
	WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) .size K512,.-K512 .word 0 @ terminator .align 5 .globl zfs_sha512_block_armv7 .type zfs_sha512_block_armv7,%function zfs_sha512_block_armv7: .Lzfs_sha512_block_armv7: #if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ zfs_sha512_block_armv7 #else adr r3,.Lzfs_sha512_block_armv7 #endif add r2,r1,r2,lsl#7 @ len to point at the end of inp stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} sub r14,r3,#672 @ K512 sub sp,sp,#9*8 ldr r7,[r0,#32+LO] ldr r8,[r0,#32+HI] ldr r9, [r0,#48+LO] ldr r10, [r0,#48+HI] ldr r11, [r0,#56+LO] ldr r12, [r0,#56+HI] .Loop: str r9, [sp,#48+0] str r10, [sp,#48+4] str r11, [sp,#56+0] str r12, [sp,#56+4] ldr r5,[r0,#0+LO] ldr r6,[r0,#0+HI] ldr r3,[r0,#8+LO] ldr r4,[r0,#8+HI] ldr r9, [r0,#16+LO] ldr r10, [r0,#16+HI] ldr r11, [r0,#24+LO] ldr r12, [r0,#24+HI] str r3,[sp,#8+0] str r4,[sp,#8+4] str r9, [sp,#16+0] str r10, [sp,#16+4] str r11, [sp,#24+0] str r12, [sp,#24+4] ldr r3,[r0,#40+LO] ldr r4,[r0,#40+HI] str r3,[sp,#40+0] str r4,[sp,#40+4] .L00_15: #if __ARM_ARCH__<7 ldrb r3,[r1,#7] ldrb r9, [r1,#6] ldrb r10, [r1,#5] ldrb r11, [r1,#4] ldrb r4,[r1,#3] ldrb r12, [r1,#2] orr r3,r3,r9,lsl#8 ldrb r9, [r1,#1] orr r3,r3,r10,lsl#16 ldrb r10, [r1],#8 orr r3,r3,r11,lsl#24 orr r4,r4,r12,lsl#8 orr r4,r4,r9,lsl#16 orr r4,r4,r10,lsl#24 #else ldr r3,[r1,#4] ldr r4,[r1],#8 #ifdef __ARMEL__ rev r3,r3 rev r4,r4 #endif #endif @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov r9,r7,lsr#14 str r3,[sp,#64+0] mov r10,r8,lsr#14 str r4,[sp,#64+4] eor r9,r9,r8,lsl#18 ldr r11,[sp,#56+0] @ h.lo eor r10,r10,r7,lsl#18 ldr r12,[sp,#56+4] @ h.hi eor r9,r9,r7,lsr#18 eor r10,r10,r8,lsr#18 eor r9,r9,r8,lsl#14 eor r10,r10,r7,lsl#14 eor r9,r9,r8,lsr#9 eor r10,r10,r7,lsr#9 eor r9,r9,r7,lsl#23 eor r10,r10,r8,lsl#23 @ Sigma1(e) adds r3,r3,r9 ldr r9,[sp,#40+0] @ f.lo adc r4,r4,r10 @ T += Sigma1(e) ldr r10,[sp,#40+4] @ f.hi adds r3,r3,r11 ldr r11,[sp,#48+0] @ g.lo adc r4,r4,r12 @ T += h ldr r12,[sp,#48+4] @ g.hi eor r9,r9,r11 str r7,[sp,#32+0] eor r10,r10,r12 str r8,[sp,#32+4] and r9,r9,r7 str r5,[sp,#0+0] and r10,r10,r8 str r6,[sp,#0+4] eor r9,r9,r11 ldr r11,[r14,#LO] @ K[i].lo eor r10,r10,r12 @ Ch(e,f,g) ldr r12,[r14,#HI] @ K[i].hi adds r3,r3,r9 ldr r7,[sp,#24+0] @ d.lo adc r4,r4,r10 @ T += Ch(e,f,g) ldr r8,[sp,#24+4] @ d.hi adds r3,r3,r11 and r9,r11,#0xff adc r4,r4,r12 @ T += K[i] adds r7,r7,r3 ldr r11,[sp,#8+0] @ b.lo adc r8,r8,r4 @ d += T teq r9,#148 ldr r12,[sp,#16+0] @ c.lo #ifdef __thumb2__ it eq @ Thumb2 thing, sanity check in ARM #endif orreq r14,r14,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ 
HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 mov r9,r5,lsr#28 mov r10,r6,lsr#28 eor r9,r9,r6,lsl#4 eor r10,r10,r5,lsl#4 eor r9,r9,r6,lsr#2 eor r10,r10,r5,lsr#2 eor r9,r9,r5,lsl#30 eor r10,r10,r6,lsl#30 eor r9,r9,r6,lsr#7 eor r10,r10,r5,lsr#7 eor r9,r9,r5,lsl#25 eor r10,r10,r6,lsl#25 @ Sigma0(a) adds r3,r3,r9 and r9,r5,r11 adc r4,r4,r10 @ T += Sigma0(a) ldr r10,[sp,#8+4] @ b.hi orr r5,r5,r11 ldr r11,[sp,#16+4] @ c.hi and r5,r5,r12 and r12,r6,r10 orr r6,r6,r10 orr r5,r5,r9 @ Maj(a,b,c).lo and r6,r6,r11 adds r5,r5,r3 orr r6,r6,r12 @ Maj(a,b,c).hi sub sp,sp,#8 adc r6,r6,r4 @ h += T tst r14,#1 add r14,r14,#8 tst r14,#1 beq .L00_15 ldr r9,[sp,#184+0] ldr r10,[sp,#184+4] bic r14,r14,#1 .L16_79: @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 mov r3,r9,lsr#1 ldr r11,[sp,#80+0] mov r4,r10,lsr#1 ldr r12,[sp,#80+4] eor r3,r3,r10,lsl#31 eor r4,r4,r9,lsl#31 eor r3,r3,r9,lsr#8 eor r4,r4,r10,lsr#8 eor r3,r3,r10,lsl#24 eor r4,r4,r9,lsl#24 eor r3,r3,r9,lsr#7 eor r4,r4,r10,lsr#7 eor r3,r3,r10,lsl#25 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 mov r9,r11,lsr#19 mov r10,r12,lsr#19 eor r9,r9,r12,lsl#13 eor r10,r10,r11,lsl#13 eor r9,r9,r12,lsr#29 eor r10,r10,r11,lsr#29 eor r9,r9,r11,lsl#3 eor r10,r10,r12,lsl#3 eor r9,r9,r11,lsr#6 eor r10,r10,r12,lsr#6 ldr r11,[sp,#120+0] eor r9,r9,r12,lsl#26 ldr r12,[sp,#120+4] adds r3,r3,r9 ldr r9,[sp,#192+0] adc r4,r4,r10 ldr r10,[sp,#192+4] adds r3,r3,r11 adc r4,r4,r12 adds r3,r3,r9 adc r4,r4,r10 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 mov r9,r7,lsr#14 str r3,[sp,#64+0] mov r10,r8,lsr#14 str r4,[sp,#64+4] eor r9,r9,r8,lsl#18 ldr r11,[sp,#56+0] @ h.lo eor r10,r10,r7,lsl#18 ldr r12,[sp,#56+4] @ h.hi eor r9,r9,r7,lsr#18 eor r10,r10,r8,lsr#18 eor r9,r9,r8,lsl#14 eor r10,r10,r7,lsl#14 eor r9,r9,r8,lsr#9 eor r10,r10,r7,lsr#9 eor r9,r9,r7,lsl#23 eor r10,r10,r8,lsl#23 @ Sigma1(e) adds r3,r3,r9 ldr r9,[sp,#40+0] @ f.lo adc r4,r4,r10 @ T += Sigma1(e) ldr r10,[sp,#40+4] @ f.hi adds r3,r3,r11 ldr r11,[sp,#48+0] @ g.lo adc r4,r4,r12 @ T += h ldr r12,[sp,#48+4] @ g.hi eor r9,r9,r11 str r7,[sp,#32+0] eor r10,r10,r12 str r8,[sp,#32+4] and r9,r9,r7 str r5,[sp,#0+0] and r10,r10,r8 str r6,[sp,#0+4] eor r9,r9,r11 ldr r11,[r14,#LO] @ K[i].lo eor r10,r10,r12 @ Ch(e,f,g) ldr r12,[r14,#HI] @ K[i].hi adds r3,r3,r9 ldr r7,[sp,#24+0] @ d.lo adc r4,r4,r10 @ T += Ch(e,f,g) ldr r8,[sp,#24+4] @ d.hi adds r3,r3,r11 and r9,r11,#0xff adc r4,r4,r12 @ T += K[i] adds r7,r7,r3 ldr r11,[sp,#8+0] @ b.lo adc r8,r8,r4 @ d += T teq r9,#23 ldr r12,[sp,#16+0] @ c.lo #ifdef __thumb2__ it eq @ Thumb2 thing, sanity check in ARM #endif orreq r14,r14,#1 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 mov r9,r5,lsr#28 mov r10,r6,lsr#28 eor r9,r9,r6,lsl#4 eor r10,r10,r5,lsl#4 eor r9,r9,r6,lsr#2 eor r10,r10,r5,lsr#2 eor r9,r9,r5,lsl#30 eor r10,r10,r6,lsl#30 eor r9,r9,r6,lsr#7 eor r10,r10,r5,lsr#7 eor r9,r9,r5,lsl#25 eor r10,r10,r6,lsl#25 @ Sigma0(a) adds r3,r3,r9 and r9,r5,r11 adc r4,r4,r10 @ T += Sigma0(a) ldr r10,[sp,#8+4] @ b.hi orr r5,r5,r11 ldr r11,[sp,#16+4] @ c.hi and r5,r5,r12 and r12,r6,r10 orr r6,r6,r10 orr r5,r5,r9 @ Maj(a,b,c).lo and r6,r6,r11 adds r5,r5,r3 orr r6,r6,r12 @ Maj(a,b,c).hi sub sp,sp,#8 adc 
r6,r6,r4 @ h += T tst r14,#1 add r14,r14,#8 #ifdef __thumb2__ ittt eq @ Thumb2 thing, sanity check in ARM #endif ldreq r9,[sp,#184+0] ldreq r10,[sp,#184+4] beq .L16_79 bic r14,r14,#1 ldr r3,[sp,#8+0] ldr r4,[sp,#8+4] ldr r9, [r0,#0+LO] ldr r10, [r0,#0+HI] ldr r11, [r0,#8+LO] ldr r12, [r0,#8+HI] adds r9,r5,r9 str r9, [r0,#0+LO] adc r10,r6,r10 str r10, [r0,#0+HI] adds r11,r3,r11 str r11, [r0,#8+LO] adc r12,r4,r12 str r12, [r0,#8+HI] ldr r5,[sp,#16+0] ldr r6,[sp,#16+4] ldr r3,[sp,#24+0] ldr r4,[sp,#24+4] ldr r9, [r0,#16+LO] ldr r10, [r0,#16+HI] ldr r11, [r0,#24+LO] ldr r12, [r0,#24+HI] adds r9,r5,r9 str r9, [r0,#16+LO] adc r10,r6,r10 str r10, [r0,#16+HI] adds r11,r3,r11 str r11, [r0,#24+LO] adc r12,r4,r12 str r12, [r0,#24+HI] ldr r3,[sp,#40+0] ldr r4,[sp,#40+4] ldr r9, [r0,#32+LO] ldr r10, [r0,#32+HI] ldr r11, [r0,#40+LO] ldr r12, [r0,#40+HI] adds r7,r7,r9 str r7,[r0,#32+LO] adc r8,r8,r10 str r8,[r0,#32+HI] adds r11,r3,r11 str r11, [r0,#40+LO] adc r12,r4,r12 str r12, [r0,#40+HI] ldr r5,[sp,#48+0] ldr r6,[sp,#48+4] ldr r3,[sp,#56+0] ldr r4,[sp,#56+4] ldr r9, [r0,#48+LO] ldr r10, [r0,#48+HI] ldr r11, [r0,#56+LO] ldr r12, [r0,#56+HI] adds r9,r5,r9 str r9, [r0,#48+LO] adc r10,r6,r10 str r10, [r0,#48+HI] adds r11,r3,r11 str r11, [r0,#56+LO] adc r12,r4,r12 str r12, [r0,#56+HI] add sp,sp,#640 sub r14,r14,#640 teq r1,r2 bne .Loop add sp,sp,#8*9 @ destroy frame #if __ARM_ARCH__>=5 ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} #else ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} tst lr,#1 moveq pc,lr @ be binary compatible with V4, yet .word 0xe12fff1e @ interoperable with Thumb ISA:-) #endif .size zfs_sha512_block_armv7,.-zfs_sha512_block_armv7 .arch armv7-a .fpu neon .globl zfs_sha512_block_neon .type zfs_sha512_block_neon,%function .align 4 zfs_sha512_block_neon: .LNEON: dmb @ errata #451034 on early Cortex A8 add r2,r1,r2,lsl#7 @ len to point at the end of inp adr r3,K512 VFP_ABI_PUSH vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context .Loop_neon: vshr.u64 d24,d20,#14 @ 0 #if 0<16 vld1.64 {d0},[r1]! @ handles unaligned #endif vshr.u64 d25,d20,#18 #if 0>0 vadd.i64 d16,d30 @ h+=Maj from the past #endif vshr.u64 d26,d20,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 0<16 && defined(__ARMEL__) vrev64.8 d0,d0 #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d0 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 1 #if 1<16 vld1.64 {d1},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 1>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 1<16 && defined(__ARMEL__) vrev64.8 d1,d1 #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d1 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 d24,d18,#14 @ 2 #if 2<16 vld1.64 {d2},[r1]! 
@ handles unaligned #endif vshr.u64 d25,d18,#18 #if 2>0 vadd.i64 d22,d30 @ h+=Maj from the past #endif vshr.u64 d26,d18,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 2<16 && defined(__ARMEL__) vrev64.8 d2,d2 #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d2 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 3 #if 3<16 vld1.64 {d3},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 3>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 3<16 && defined(__ARMEL__) vrev64.8 d3,d3 #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d3 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 d24,d16,#14 @ 4 #if 4<16 vld1.64 {d4},[r1]! @ handles unaligned #endif vshr.u64 d25,d16,#18 #if 4>0 vadd.i64 d20,d30 @ h+=Maj from the past #endif vshr.u64 d26,d16,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 4<16 && defined(__ARMEL__) vrev64.8 d4,d4 #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d4 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 5 #if 5<16 vld1.64 {d5},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 5>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 5<16 && defined(__ARMEL__) vrev64.8 d5,d5 #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d5 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 d24,d22,#14 @ 6 #if 6<16 vld1.64 {d6},[r1]! @ handles unaligned #endif vshr.u64 d25,d22,#18 #if 6>0 vadd.i64 d18,d30 @ h+=Maj from the past #endif vshr.u64 d26,d22,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 6<16 && defined(__ARMEL__) vrev64.8 d6,d6 #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d6 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 7 #if 7<16 vld1.64 {d7},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 7>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 7<16 && defined(__ARMEL__) vrev64.8 d7,d7 #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d7 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 vshr.u64 d24,d20,#14 @ 8 #if 8<16 vld1.64 {d8},[r1]! @ handles unaligned #endif vshr.u64 d25,d20,#18 #if 8>0 vadd.i64 d16,d30 @ h+=Maj from the past #endif vshr.u64 d26,d20,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 8<16 && defined(__ARMEL__) vrev64.8 d8,d8 #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d8 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 9 #if 9<16 vld1.64 {d9},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 9>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 9<16 && defined(__ARMEL__) vrev64.8 d9,d9 #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d9 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 d24,d18,#14 @ 10 #if 10<16 vld1.64 {d10},[r1]! @ handles unaligned #endif vshr.u64 d25,d18,#18 #if 10>0 vadd.i64 d22,d30 @ h+=Maj from the past #endif vshr.u64 d26,d18,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 10<16 && defined(__ARMEL__) vrev64.8 d10,d10 #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d10 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 11 #if 11<16 vld1.64 {d11},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 11>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 11<16 && defined(__ARMEL__) vrev64.8 d11,d11 #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d11 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 d24,d16,#14 @ 12 #if 12<16 vld1.64 {d12},[r1]! @ handles unaligned #endif vshr.u64 d25,d16,#18 #if 12>0 vadd.i64 d20,d30 @ h+=Maj from the past #endif vshr.u64 d26,d16,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 12<16 && defined(__ARMEL__) vrev64.8 d12,d12 #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d12 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 13 #if 13<16 vld1.64 {d13},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 13>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 13<16 && defined(__ARMEL__) vrev64.8 d13,d13 #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d13 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 d24,d22,#14 @ 14 #if 14<16 vld1.64 {d14},[r1]! @ handles unaligned #endif vshr.u64 d25,d22,#18 #if 14>0 vadd.i64 d18,d30 @ h+=Maj from the past #endif vshr.u64 d26,d22,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 14<16 && defined(__ARMEL__) vrev64.8 d14,d14 #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d14 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 15 #if 15<16 vld1.64 {d15},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 15>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 15<16 && defined(__ARMEL__) vrev64.8 d15,d15 #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d15 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 mov r12,#4 .L16_79_neon: subs r12,#1 vshr.u64 q12,q7,#19 vshr.u64 q13,q7,#61 vadd.i64 d16,d30 @ h+=Maj from the past vshr.u64 q15,q7,#6 vsli.64 q12,q7,#45 vext.8 q14,q0,q1,#8 @ X[i+1] vsli.64 q13,q7,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q0,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q4,q5,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d20,#14 @ from NEON_00_15 vadd.i64 q0,q14 vshr.u64 d25,d20,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d20,#41 @ from NEON_00_15 vadd.i64 q0,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 16<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d0 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 17 #if 17<16 vld1.64 {d1},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 17>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 17<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d1 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 q12,q0,#19 vshr.u64 q13,q0,#61 vadd.i64 d22,d30 @ h+=Maj from the past vshr.u64 q15,q0,#6 vsli.64 q12,q0,#45 vext.8 q14,q1,q2,#8 @ X[i+1] vsli.64 q13,q0,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q1,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q5,q6,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d18,#14 @ from NEON_00_15 vadd.i64 q1,q14 vshr.u64 d25,d18,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d18,#41 @ from NEON_00_15 vadd.i64 q1,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 18<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d2 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 19 #if 19<16 vld1.64 {d3},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 19>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 19<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d3 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 q12,q1,#19 vshr.u64 q13,q1,#61 vadd.i64 d20,d30 @ h+=Maj from the past vshr.u64 q15,q1,#6 vsli.64 q12,q1,#45 vext.8 q14,q2,q3,#8 @ X[i+1] vsli.64 q13,q1,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q2,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q6,q7,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d16,#14 @ from NEON_00_15 vadd.i64 q2,q14 vshr.u64 d25,d16,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d16,#41 @ from NEON_00_15 vadd.i64 q2,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 20<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d4 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 21 #if 21<16 vld1.64 {d5},[r1]! 
@ handles unaligned #endif vshr.u64 d25,d23,#18 #if 21>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 21<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d5 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 q12,q2,#19 vshr.u64 q13,q2,#61 vadd.i64 d18,d30 @ h+=Maj from the past vshr.u64 q15,q2,#6 vsli.64 q12,q2,#45 vext.8 q14,q3,q4,#8 @ X[i+1] vsli.64 q13,q2,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q3,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q7,q0,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d22,#14 @ from NEON_00_15 vadd.i64 q3,q14 vshr.u64 d25,d22,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d22,#41 @ from NEON_00_15 vadd.i64 q3,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 22<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d6 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 23 #if 23<16 vld1.64 {d7},[r1]! @ handles unaligned #endif vshr.u64 d25,d21,#18 #if 23>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 23<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d7 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 vshr.u64 q12,q3,#19 vshr.u64 q13,q3,#61 vadd.i64 d16,d30 @ h+=Maj from the past vshr.u64 q15,q3,#6 vsli.64 q12,q3,#45 vext.8 q14,q4,q5,#8 @ X[i+1] vsli.64 q13,q3,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q4,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q0,q1,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d20,#14 @ from NEON_00_15 vadd.i64 q4,q14 vshr.u64 d25,d20,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d20,#41 @ from NEON_00_15 vadd.i64 q4,q15 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d20,#50 vsli.64 d25,d20,#46 vmov d29,d20 vsli.64 d26,d20,#23 #if 24<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d21,d22 @ Ch(e,f,g) vshr.u64 d24,d16,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d23 vshr.u64 d25,d16,#34 vsli.64 d24,d16,#36 vadd.i64 d27,d26 vshr.u64 d26,d16,#39 vadd.i64 d28,d8 vsli.64 d25,d16,#30 veor d30,d16,d17 vsli.64 d26,d16,#25 veor d23,d24,d25 vadd.i64 d27,d28 vbsl d30,d18,d17 @ Maj(a,b,c) veor d23,d26 @ Sigma0(a) vadd.i64 d19,d27 vadd.i64 d30,d27 @ vadd.i64 d23,d30 vshr.u64 d24,d19,#14 @ 25 #if 25<16 vld1.64 {d9},[r1]! @ handles unaligned #endif vshr.u64 d25,d19,#18 #if 25>0 vadd.i64 d23,d30 @ h+=Maj from the past #endif vshr.u64 d26,d19,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d19,#50 vsli.64 d25,d19,#46 vmov d29,d19 vsli.64 d26,d19,#23 #if 25<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d20,d21 @ Ch(e,f,g) vshr.u64 d24,d23,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d22 vshr.u64 d25,d23,#34 vsli.64 d24,d23,#36 vadd.i64 d27,d26 vshr.u64 d26,d23,#39 vadd.i64 d28,d9 vsli.64 d25,d23,#30 veor d30,d23,d16 vsli.64 d26,d23,#25 veor d22,d24,d25 vadd.i64 d27,d28 vbsl d30,d17,d16 @ Maj(a,b,c) veor d22,d26 @ Sigma0(a) vadd.i64 d18,d27 vadd.i64 d30,d27 @ vadd.i64 d22,d30 vshr.u64 q12,q4,#19 vshr.u64 q13,q4,#61 vadd.i64 d22,d30 @ h+=Maj from the past vshr.u64 q15,q4,#6 vsli.64 q12,q4,#45 vext.8 q14,q5,q6,#8 @ X[i+1] vsli.64 q13,q4,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q5,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q1,q2,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d18,#14 @ from NEON_00_15 vadd.i64 q5,q14 vshr.u64 d25,d18,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d18,#41 @ from NEON_00_15 vadd.i64 q5,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d18,#50 vsli.64 d25,d18,#46 vmov d29,d18 vsli.64 d26,d18,#23 #if 26<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d19,d20 @ Ch(e,f,g) vshr.u64 d24,d22,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d21 vshr.u64 d25,d22,#34 vsli.64 d24,d22,#36 vadd.i64 d27,d26 vshr.u64 d26,d22,#39 vadd.i64 d28,d10 vsli.64 d25,d22,#30 veor d30,d22,d23 vsli.64 d26,d22,#25 veor d21,d24,d25 vadd.i64 d27,d28 vbsl d30,d16,d23 @ Maj(a,b,c) veor d21,d26 @ Sigma0(a) vadd.i64 d17,d27 vadd.i64 d30,d27 @ vadd.i64 d21,d30 vshr.u64 d24,d17,#14 @ 27 #if 27<16 vld1.64 {d11},[r1]! @ handles unaligned #endif vshr.u64 d25,d17,#18 #if 27>0 vadd.i64 d21,d30 @ h+=Maj from the past #endif vshr.u64 d26,d17,#41 vld1.64 {d28},[r3,:64]! 
@ K[i++] vsli.64 d24,d17,#50 vsli.64 d25,d17,#46 vmov d29,d17 vsli.64 d26,d17,#23 #if 27<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d18,d19 @ Ch(e,f,g) vshr.u64 d24,d21,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d20 vshr.u64 d25,d21,#34 vsli.64 d24,d21,#36 vadd.i64 d27,d26 vshr.u64 d26,d21,#39 vadd.i64 d28,d11 vsli.64 d25,d21,#30 veor d30,d21,d22 vsli.64 d26,d21,#25 veor d20,d24,d25 vadd.i64 d27,d28 vbsl d30,d23,d22 @ Maj(a,b,c) veor d20,d26 @ Sigma0(a) vadd.i64 d16,d27 vadd.i64 d30,d27 @ vadd.i64 d20,d30 vshr.u64 q12,q5,#19 vshr.u64 q13,q5,#61 vadd.i64 d20,d30 @ h+=Maj from the past vshr.u64 q15,q5,#6 vsli.64 q12,q5,#45 vext.8 q14,q6,q7,#8 @ X[i+1] vsli.64 q13,q5,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q6,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q2,q3,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d16,#14 @ from NEON_00_15 vadd.i64 q6,q14 vshr.u64 d25,d16,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d16,#41 @ from NEON_00_15 vadd.i64 q6,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d16,#50 vsli.64 d25,d16,#46 vmov d29,d16 vsli.64 d26,d16,#23 #if 28<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d17,d18 @ Ch(e,f,g) vshr.u64 d24,d20,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d19 vshr.u64 d25,d20,#34 vsli.64 d24,d20,#36 vadd.i64 d27,d26 vshr.u64 d26,d20,#39 vadd.i64 d28,d12 vsli.64 d25,d20,#30 veor d30,d20,d21 vsli.64 d26,d20,#25 veor d19,d24,d25 vadd.i64 d27,d28 vbsl d30,d22,d21 @ Maj(a,b,c) veor d19,d26 @ Sigma0(a) vadd.i64 d23,d27 vadd.i64 d30,d27 @ vadd.i64 d19,d30 vshr.u64 d24,d23,#14 @ 29 #if 29<16 vld1.64 {d13},[r1]! @ handles unaligned #endif vshr.u64 d25,d23,#18 #if 29>0 vadd.i64 d19,d30 @ h+=Maj from the past #endif vshr.u64 d26,d23,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d23,#50 vsli.64 d25,d23,#46 vmov d29,d23 vsli.64 d26,d23,#23 #if 29<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d16,d17 @ Ch(e,f,g) vshr.u64 d24,d19,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d18 vshr.u64 d25,d19,#34 vsli.64 d24,d19,#36 vadd.i64 d27,d26 vshr.u64 d26,d19,#39 vadd.i64 d28,d13 vsli.64 d25,d19,#30 veor d30,d19,d20 vsli.64 d26,d19,#25 veor d18,d24,d25 vadd.i64 d27,d28 vbsl d30,d21,d20 @ Maj(a,b,c) veor d18,d26 @ Sigma0(a) vadd.i64 d22,d27 vadd.i64 d30,d27 @ vadd.i64 d18,d30 vshr.u64 q12,q6,#19 vshr.u64 q13,q6,#61 vadd.i64 d18,d30 @ h+=Maj from the past vshr.u64 q15,q6,#6 vsli.64 q12,q6,#45 vext.8 q14,q7,q0,#8 @ X[i+1] vsli.64 q13,q6,#3 veor q15,q12 vshr.u64 q12,q14,#1 veor q15,q13 @ sigma1(X[i+14]) vshr.u64 q13,q14,#8 vadd.i64 q7,q15 vshr.u64 q15,q14,#7 vsli.64 q12,q14,#63 vsli.64 q13,q14,#56 vext.8 q14,q3,q4,#8 @ X[i+9] veor q15,q12 vshr.u64 d24,d22,#14 @ from NEON_00_15 vadd.i64 q7,q14 vshr.u64 d25,d22,#18 @ from NEON_00_15 veor q15,q13 @ sigma0(X[i+1]) vshr.u64 d26,d22,#41 @ from NEON_00_15 vadd.i64 q7,q15 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d22,#50 vsli.64 d25,d22,#46 vmov d29,d22 vsli.64 d26,d22,#23 #if 30<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d23,d16 @ Ch(e,f,g) vshr.u64 d24,d18,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d17 vshr.u64 d25,d18,#34 vsli.64 d24,d18,#36 vadd.i64 d27,d26 vshr.u64 d26,d18,#39 vadd.i64 d28,d14 vsli.64 d25,d18,#30 veor d30,d18,d19 vsli.64 d26,d18,#25 veor d17,d24,d25 vadd.i64 d27,d28 vbsl d30,d20,d19 @ Maj(a,b,c) veor d17,d26 @ Sigma0(a) vadd.i64 d21,d27 vadd.i64 d30,d27 @ vadd.i64 d17,d30 vshr.u64 d24,d21,#14 @ 31 #if 31<16 vld1.64 {d15},[r1]! 
@ handles unaligned #endif vshr.u64 d25,d21,#18 #if 31>0 vadd.i64 d17,d30 @ h+=Maj from the past #endif vshr.u64 d26,d21,#41 vld1.64 {d28},[r3,:64]! @ K[i++] vsli.64 d24,d21,#50 vsli.64 d25,d21,#46 vmov d29,d21 vsli.64 d26,d21,#23 #if 31<16 && defined(__ARMEL__) vrev64.8 , #endif veor d25,d24 vbsl d29,d22,d23 @ Ch(e,f,g) vshr.u64 d24,d17,#28 veor d26,d25 @ Sigma1(e) vadd.i64 d27,d29,d16 vshr.u64 d25,d17,#34 vsli.64 d24,d17,#36 vadd.i64 d27,d26 vshr.u64 d26,d17,#39 vadd.i64 d28,d15 vsli.64 d25,d17,#30 veor d30,d17,d18 vsli.64 d26,d17,#25 veor d16,d24,d25 vadd.i64 d27,d28 vbsl d30,d19,d18 @ Maj(a,b,c) veor d16,d26 @ Sigma0(a) vadd.i64 d20,d27 vadd.i64 d30,d27 @ vadd.i64 d16,d30 bne .L16_79_neon vadd.i64 d16,d30 @ h+=Maj from the past vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp vadd.i64 q8,q12 @ vectorized accumulate vadd.i64 q9,q13 vadd.i64 q10,q14 vadd.i64 q11,q15 vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context teq r1,r2 sub r3,#640 @ rewind K512 bne .Loop_neon VFP_ABI_POP bx lr @ .word 0xe12fff1e .size zfs_sha512_block_neon,.-zfs_sha512_block_neon #endif diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c index 74755eb6d9ef..2b62abcccb78 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zio_crypt.c @@ -1,1826 +1,1831 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. 
The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing * which pieces of the block need to be encrypted. 
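As a quick sanity check of the 2^15 slots-per-block figure above: a 2^24-byte block divided into the customary 2^9-byte dnode slots gives 2^(24-9) = 2^15 slots. The standalone sketch below restates those two shift constants locally (the real DNODE_SHIFT and SPA_MAXBLOCKSHIFT live in the ZFS headers) so it compiles on its own.

	/*
	 * Sketch only: 16MB maximum block / 512-byte dnode slots
	 * = 2^(24 - 9) = 2^15 slots per L0 dnode block.
	 */
	#define	SKETCH_SPA_MAXBLOCKSHIFT	24	/* 16MB (2^24) max block */
	#define	SKETCH_DNODE_SHIFT		9	/* 512-byte (2^9) dnode slot */

	_Static_assert((1 << (SKETCH_SPA_MAXBLOCKSHIFT - SKETCH_DNODE_SHIFT)) ==
	    32768, "at most 2^15 dnode slots per L0 block");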
For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. 
With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; static void zio_crypt_key_destroy_early(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ memset(&key->zk_session, 0, sizeof (key->zk_session)); /* zero out sensitive data */ memset(key, 0, sizeof (zio_crypt_key_t)); } void zio_crypt_key_destroy(zio_crypt_key_t *key) { freebsd_crypt_freesession(&key->zk_session); zio_crypt_key_destroy_early(key); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech __unused; uint_t keydata_len; const zio_crypt_info_t *ci = NULL; ASSERT3P(key, !=, NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); keydata_len = zio_crypt_table[crypt].ci_keylen; memset(key, 0, sizeof (zio_crypt_key_t)); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); ret = freebsd_crypt_newsession(&key->zk_session, ci, &key->zk_current_key); if (ret) goto error; key->zk_crypt = crypt; key->zk_version = 
ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy_early(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech __unused; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; freebsd_crypt_freesession(&key->zk_session); ret = freebsd_crypt_newsession(&key->zk_session, &zio_crypt_table[key->zk_crypt], &key->zk_current_key); if (ret != 0) goto out_unlock; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, RW_READER); memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } void *failed_decrypt_buf; int failed_decrypt_size; /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ /* * The implementation for FreeBSD's OpenCrypto. * * The big difference between ICP and FOC is that FOC uses a single * buffer for input and output. This means that (for AES-GCM, the * only one supported right now) the source must be copied into the * destination, and the destination must have the AAD, and the tag/MAC, * already associated with it. (Both implementations can use a uio.) * * Since the auth data is part of the iovec array, all we need to know * is the length: 0 means there's no AAD. * */ static int zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *uio, uint_t auth_len) { const zio_crypt_info_t *ci = &zio_crypt_table[crypt]; if (ci->ci_crypt_type != ZC_TYPE_GCM && ci->ci_crypt_type != ZC_TYPE_CCM) return (ENOTSUP); int ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf, datalen, auth_len); if (ret != 0) { #ifdef FCRYPTO_DEBUG printf("%s(%d): Returning error %s\n", __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM"); #endif ret = SET_ERROR(encrypt ? EIO : ECKSUM); } return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; uint64_t aad[3]; /* * With OpenCrypto in FreeBSD, the same buffer is used for * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. 
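To put a number on the collision bound quoted earlier (reading the approximation as the probability of at least one repeated IV, i.e. 1 - e^(-n*(n-1)/(2*(2^96)))), the following standalone sketch evaluates it at the default rotation limit; the resulting figure, roughly 1.0e-12, is what motivates ZFS_KEY_MAX_SALT_USES_DEFAULT.

	#include <math.h>
	#include <stdio.h>

	int
	main(void)
	{
		double n = 400000000.0;			/* default salt-rotation limit */
		double iv_space = ldexp(1.0, 96);	/* 2^96 possible 96-bit IVs */
		/* birthday bound: P(collision) ~= 1 - e^(-n*(n-1)/(2*2^96)) */
		double p = -expm1(-(n * (n - 1.0)) / (2.0 * iv_space));

		/* prints roughly 1.0e-12, the one-in-a-trillion target */
		printf("p(%.0f) ~= %.3e\n", n, p);
		return (0);
	}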
*/ zfs_uio_t cuio; struct uio cuio_s; iovec_t iovecs[4]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); zfs_uio_init(&cuio, &cuio_s); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* * Since we only support one buffer, we need to copy * the plain text (source) to the cipher buffer (dest). * We set iovecs[0] -- the authentication data -- below. */ memcpy(keydata_out, key->zk_master_keydata, keydata_len); memcpy(hmac_keydata_out, key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); iovecs[1].iov_base = keydata_out; iovecs[1].iov_len = keydata_len; iovecs[2].iov_base = hmac_keydata_out; iovecs[2].iov_len = SHA512_HMAC_KEYLEN; iovecs[3].iov_base = mac; iovecs[3].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. */ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; zfs_uio_iovcnt(&cuio) = 4; zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, iv, enc_len, &cuio, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { int ret; uint64_t aad[3]; /* * With OpenCrypto in FreeBSD, the same buffer is used for * input and output. Also, the AAD (for AES-GMC at least) * needs to logically go in front. */ zfs_uio_t cuio; struct uio cuio_s; iovec_t iovecs[4]; void *src, *dst; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); zfs_uio_init(&cuio, &cuio_s); /* * Since we only support one buffer, we need to copy * the encrypted buffer (source) to the plain buffer * (dest). We set iovecs[0] -- the authentication data -- * below. 
*/ dst = key->zk_master_keydata; src = keydata; memcpy(dst, src, keydata_len); dst = key->zk_hmac_keydata; src = hmac_keydata; memcpy(dst, src, SHA512_HMAC_KEYLEN); iovecs[1].iov_base = key->zk_master_keydata; iovecs[1].iov_len = keydata_len; iovecs[2].iov_base = key->zk_hmac_keydata; iovecs[2].iov_len = SHA512_HMAC_KEYLEN; iovecs[3].iov_base = mac; iovecs[3].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; iovecs[0].iov_base = aad; iovecs[0].iov_len = aad_len; GET_UIO_STRUCT(&cuio)->uio_iov = iovecs; zfs_uio_iovcnt(&cuio) = 4; zfs_uio_segflg(&cuio) = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, iv, enc_len, &cuio, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); ret = freebsd_crypt_newsession(&key->zk_session, &zio_crypt_table[crypt], &key->zk_current_key); if (ret != 0) goto error; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy_early(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: memset(ivbuf, 0, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); crypto_mac(&key->zk_hmac_key, data, datalen, raw_digestbuf, SHA512_DIGEST_LENGTH); memcpy(digestbuf, raw_digestbuf, digestlen); return (0); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); memcpy(salt, digestbuf, ZIO_DATA_SALT_LEN); memcpy(ivbuf, digestbuf + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. 
*/ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); memcpy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { memcpy(&val64, salt, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); memcpy(&val64, iv, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { memset(salt, 0, ZIO_DATA_SALT_LEN); memset(iv, 0, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); memcpy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); memcpy(salt, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); memcpy(iv, &val64, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { memcpy(&val64, mac, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); memcpy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { memset(mac, 0, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); memcpy(mac, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); memcpy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. 
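As a usage sketch for the blkptr codecs just defined, assuming the caller already holds an encrypted (non-objset) blkptr_t pointer named bp: the two decode calls simply recover the parameters that the block comment at the top of this file describes as stashed in DVA[2], the IV2 bits of blk_fill, and blk_cksum.

	uint8_t salt[ZIO_DATA_SALT_LEN];
	uint8_t iv[ZIO_DATA_IV_LEN];
	uint8_t mac[ZIO_DATA_MAC_LEN];

	/* salt and the 96-bit IV come back from DVA[2] and the IV2 field */
	zio_crypt_decode_params_bp(bp, salt, iv);
	/* the MAC comes back from the second 128 bits of blk_cksum */
	zio_crypt_decode_mac_bp(bp, mac);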
datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { int avoidlint = SPA_MINBLOCKSIZE; /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, avoidlint); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, avoidlint); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. 
*/ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); crypto_mac_update(ctx, &bab, bab_len); return (0); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); memcpy(*aadp, &bab, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; /* authenticate the core dnode (masking out non-portable bits) */ memcpy(tmp_dncore, dnp, sizeof (tmp_dncore)); adnp = (dnode_phys_t *)tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; crypto_mac_update(ctx, adnp, sizeof (tmp_dncore)); for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. 
In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; struct hmac_ctx hash_ctx; struct hmac_ctx *ctx = &hash_ctx; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* calculate the portable MAC from the portable fields and metadnode */ crypto_mac_init(ctx, &key->zk_hmac_key); /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH); memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to * decide if the local_mac should be zeroed out. That flag will always * be set by dmu_objset_id_quota_upgrade_cb() and * dmu_objset_userspace_upgrade_cb() if useraccounting has been * completed. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); boolean_t uacct_incomplete = !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. */ if (uacct_incomplete || (datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1)) { memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ crypto_mac_init(ctx, &key->zk_hmac_key); /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); crypto_mac_update(ctx, &intval, sizeof (uint64_t)); /* XXX check dnode type ... 
*/ /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH); memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN); memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (GET_UIO_STRUCT(uio)->uio_iov) kmem_free(GET_UIO_STRUCT(uio)->uio_iov, zfs_uio_iovcnt(uio) * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); return (0); } if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) { #ifdef FCRYPTO_DEBUG printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__); #endif return (SET_ERROR(ECKSUM)); } return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. 
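 * As a rough sketch of how a log block is carved up here (illustrative
 * only, the code below is authoritative):
 *
 *   zil_chain_t header       - authenticated, minus the trailing
 *                              zio_eck_t that will hold the MAC
 *   each lr_t record header  - authenticated
 *   lr_write_t lr_blkptr     - left in plaintext and authenticated
 *   lr_clone_range_t bps     - left in plaintext and authenticated
 *   all other record data    - encrypted
 *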
Everything that is not encrypted is authenticated. */ /* * The OpenCrypto used in FreeBSD does not use separate source and * destination buffers; instead, the same buffer is used. Further, to * accommodate some of the drivers, the authbuf needs to be logically before * the data. This means that we need to copy the source to the destination, * and set up an extra iovec_t at the beginning to handle the authbuf. * It also means we'll only return one zfs_uio_t. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { (void) puio; uint8_t *aadbuf = zio_buf_alloc(datalen); uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; iovec_t *dst_iovecs; zil_chain_t *zilc; lr_t *lr; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t crypt_len, nr_iovecs, vec; uint_t aad_len = 0, total_len = 0; if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); /* Find the start and end record of the log block. */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* * Calculate the number of encrypted iovecs we will need. */ /* We need at least two iovecs -- one for the AAD, one for the MAC. */ nr_iovecs = 2; for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (byteswap) { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } else { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP); /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. */ memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); /* * Loop over records again, filling in iovecs. */ /* The first iovec will contain the authbuf. */ vec = 1; for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ memcpy(dlrp, slrp, sizeof (lr_t)); memcpy(aadp, slrp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. 
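 *
 * (Sketch: the record is split into [lr_t | fields up to lr_blkptr |
 * blkptr_t | any data that follows the lr_write_t]. The two pieces
 * around the bp get their own encrypted iovecs; the lr_t was already
 * added to the AAD above and the bp is added to it just below.)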
*/ if (txtype == TX_WRITE) { const size_t o = offsetof(lr_write_t, lr_blkptr); crypt_len = o - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); vec++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); dst_iovecs[vec].iov_base = (char *) dlrp + sizeof (lr_write_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } } else if (txtype == TX_CLONE_RANGE) { const size_t o = offsetof(lr_clone_range_t, lr_nbps); crypt_len = o - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; /* copy the bps now since they will not be encrypted */ memcpy(dlrp + o, slrp + o, lr_len - o); memcpy(aadp, slrp + o, lr_len - o); aadp += lr_len - o; aad_len += lr_len - o; vec++; total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); dst_iovecs[vec].iov_base = (char *)dlrp + sizeof (lr_t); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } } /* The last iovec will contain the MAC. */ ASSERT3U(vec, ==, nr_iovecs - 1); /* AAD */ dst_iovecs[0].iov_base = aadbuf; dst_iovecs[0].iov_len = aad_len; /* MAC */ dst_iovecs[vec].iov_base = 0; dst_iovecs[vec].iov_len = 0; *no_crypt = (vec == 1); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } /* * Special case handling routine for encrypting / decrypting dnode blocks. */ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { uint8_t *aadbuf = zio_buf_alloc(datalen); uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; iovec_t *dst_iovecs; uint_t nr_iovecs, crypt_len, vec; uint_t aad_len = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ /* We need at least two iovecs -- one for the AAD, one for the MAC. */ nr_iovecs = 2; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } dst_iovecs = kmem_alloc(nr_iovecs * sizeof (iovec_t), KM_SLEEP); /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ /* The first iovec will contain the authbuf. 
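 * (Iovec layout sketch: [0] holds the AAD buffer, [1 .. n-2] hold one
 * entry per encrypted bonus buffer, and the final entry is reserved
 * for the MAC.)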
*/ vec = 1; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ memcpy(&ddnp[i], dnp, (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); memcpy(aadp, dnp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). */ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { dst_iovecs[vec].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[vec].iov_len = crypt_len; vec++; total_len += crypt_len; } else { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len); memcpy(aadp, DN_BONUS(dnp), crypt_len); aadp += crypt_len; aad_len += crypt_len; } } /* The last iovec will contain the MAC. 
*/ ASSERT3U(vec, ==, nr_iovecs - 1); /* AAD */ dst_iovecs[0].iov_base = aadbuf; dst_iovecs[0].iov_len = aad_len; /* MAC */ dst_iovecs[vec].iov_base = 0; dst_iovecs[vec].iov_len = 0; *no_crypt = (vec == 1); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; GET_UIO_STRUCT(out_uio)->uio_iov = dst_iovecs; zfs_uio_iovcnt(out_uio) = nr_iovecs; return (0); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *out_uio, uint_t *enc_len) { (void) puio; int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; void *src, *dst; cipher_iovecs = kmem_zalloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } if (encrypt) { src = plainbuf; dst = cipherbuf; } else { src = cipherbuf; dst = plainbuf; } memcpy(dst, src, datalen); cipher_iovecs[0].iov_base = dst; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; GET_UIO_STRUCT(out_uio)->uio_iov = cipher_iovecs; zfs_uio_iovcnt(out_uio) = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; GET_UIO_STRUCT(out_uio)->uio_iov = NULL; zfs_uio_iovcnt(out_uio) = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. */ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ zfs_uio_segflg(cuio) = UIO_SYSSPACE; mac_iov = ((iovec_t *)&(GET_UIO_STRUCT(cuio)-> uio_iov[zfs_uio_iovcnt(cuio) - 1])); mac_iov->iov_base = (void *)mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } void *failed_decrypt_buf; int faile_decrypt_size; /* * Primary encryption / decryption entrypoint for zio data. 
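 *
 * (Rough call flow, for orientation only: zio_crypt_init_uios() builds
 * the iovecs and AAD for the block type, then either the cached
 * zk_current_key (when the salt matches) or a temporary key derived
 * with hkdf_sha512() is handed to zio_do_crypt_uio_opencrypto() for
 * the actual AEAD operation.)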
*/ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; struct uio puio_s, cuio_s; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; freebsd_crypt_session_t *tmpl = NULL; uint8_t *authbuf = NULL; zfs_uio_init(&puio, &puio_s); zfs_uio_init(&cuio, &cuio_s); memset(GET_UIO_STRUCT(&puio), 0, sizeof (struct uio)); memset(GET_UIO_STRUCT(&cuio), 0, sizeof (struct uio)); #ifdef FCRYPTO_DEBUG printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", __FUNCTION__, encrypt ? "encrypt" : "decrypt", key, salt, ot, iv, mac, datalen, byteswap ? "byteswap" : "native_endian", plainbuf, cipherbuf, no_crypt); printf("\tkey = {"); for (int i = 0; i < key->zk_current_key.ck_length/8; i++) printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]); printf("}\n"); #endif /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) return (ret); /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = &key->zk_session; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* perform the encryption / decryption */ ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt, ckey, iv, enc_len, &cuio, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (!encrypt) { if (failed_decrypt_buf != NULL) kmem_free(failed_decrypt_buf, failed_decrypt_size); failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP); failed_decrypt_size = datalen; memcpy(failed_decrypt_buf, cipherbuf, datalen); } if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (SET_ERROR(ret)); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. 
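 *
 * (Borrow pattern sketch: the abd acting as the source is borrowed
 * with abd_borrow_buf_copy() so its contents are readable, the
 * destination with abd_borrow_buf(); on the way out the destination
 * is written back with abd_return_buf_copy().)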
*/ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (SET_ERROR(ret)); } #if defined(_KERNEL) && defined(HAVE_SPL) /* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); #endif diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c index 55f807ccfc13..21f3740f6fe6 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zio_crypt.c @@ -1,2075 +1,2080 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include /* * This file is responsible for handling all of the details of generating * encryption parameters and performing encryption and authentication. * * BLOCK ENCRYPTION PARAMETERS: * Encryption /Authentication Algorithm Suite (crypt): * The encryption algorithm, mode, and key length we are going to use. We * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit * keys. All authentication is currently done with SHA512-HMAC. * * Plaintext: * The unencrypted data that we want to encrypt. * * Initialization Vector (IV): * An initialization vector for the encryption algorithms. This is used to * "tweak" the encryption algorithms so that two blocks of the same data are * encrypted into different ciphertext outputs, thus obfuscating block patterns. * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is * never reused with the same encryption key. This value is stored unencrypted * and must simply be provided to the decryption function. We use a 96 bit IV * (as recommended by NIST) for all block encryption. For non-dedup blocks we * derive the IV randomly. The first 64 bits of the IV are stored in the second * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count * of 1. 
The only exception is for DMU_OT_DNODE objects, where the fill count of * level 0 blocks is the number of allocated dnodes in that block. The on-disk * format supports at most 2^15 slots per L0 dnode block, because the maximum * block size is 16MB (2^24). In either case, for level 0 blocks this number * will still be smaller than UINT32_MAX so it is safe to store the IV in the * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count * for the dnode code. * * Master key: * This is the most important secret data of an encrypted dataset. It is used * along with the salt to generate that actual encryption keys via HKDF. We * do not use the master key to directly encrypt any data because there are * theoretical limits on how much data can actually be safely encrypted with * any encryption mode. The master key is stored encrypted on disk with the * user's wrapping key. Its length is determined by the encryption algorithm. * For details on how this is stored see the block comment in dsl_crypt.c * * Salt: * Used as an input to the HKDF function, along with the master key. We use a * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt * can be used for encrypting many blocks, so we cache the current salt and the * associated derived key in zio_crypt_t so we do not need to derive it again * needlessly. * * Encryption Key: * A secret binary key, generated from an HKDF function used to encrypt and * decrypt data. * * Message Authentication Code (MAC) * The MAC is an output of authenticated encryption modes such as AES-GCM and * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted * data on disk and return garbage to the application. Effectively, it is a * checksum that can not be reproduced by an attacker. We store the MAC in the * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated * regular checksum of the ciphertext which can be used for scrubbing. * * OBJECT AUTHENTICATION: * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because * they contain some info that always needs to be readable. To prevent this * data from being altered, we authenticate this data using SHA512-HMAC. This * will produce a MAC (similar to the one produced via encryption) which can * be used to verify the object was not modified. HMACs do not require key * rotation or IVs, so we can keep up to the full 3 copies of authenticated * data. * * ZIL ENCRYPTION: * ZIL blocks have their bp written to disk ahead of the associated data, so we * cannot store the MAC there as we normally do. For these blocks the MAC is * stored in the embedded checksum within the zil_chain_t header. The salt and * IV are generated for the block on bp allocation instead of at encryption * time. In addition, ZIL blocks have some pieces that must be left in plaintext * for claiming even though all of the sensitive user data still needs to be * encrypted. The function zio_crypt_init_uios_zil() handles parsing which * pieces of the block need to be encrypted. All data that is not encrypted is * authenticated using the AAD mechanisms that the supported encryption modes * provide for. In order to preserve the semantics of the ZIL for encrypted * datasets, the ZIL is not protected at the objset level as described below. * * DNODE ENCRYPTION: * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left * in plaintext for scrubbing and claiming, but the bonus buffers might contain * sensitive user data. 
The function zio_crypt_init_uios_dnode() handles parsing * which pieces of the block need to be encrypted. For more details about * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). * * OBJECT SET AUTHENTICATION: * Up to this point, everything we have encrypted and authenticated has been * at level 0 (or -2 for the ZIL). If we did not do any further work the * on-disk format would be susceptible to attacks that deleted or rearranged * the order of level 0 blocks. Ideally, the cleanest solution would be to * maintain a tree of authentication MACs going up the bp tree. However, this * presents a problem for raw sends. Send files do not send information about * indirect blocks so there would be no convenient way to transfer the MACs and * they cannot be recalculated on the receive side without the master key which * would defeat one of the purposes of raw sends in the first place. Instead, * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs * from the level below. We also include some portable fields from blk_prop such * as the lsize and compression algorithm to prevent the data from being * misinterpreted. * * At the objset level, we maintain 2 separate 256 bit MACs in the * objset_phys_t. The first one is "portable" and is the logical root of the * MAC tree maintained in the metadnode's bps. The second, is "local" and is * used as the root MAC for the user accounting objects, which are also not * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload * of the send file. The useraccounting code ensures that the useraccounting * info is not present upon a receive, so the local MAC can simply be cleared * out at that time. For more info about objset_phys_t authentication, see * zio_crypt_do_objset_hmacs(). * * CONSIDERATIONS FOR DEDUP: * In order for dedup to work, blocks that we want to dedup with one another * need to use the same IV and encryption key, so that they will have the same * ciphertext. Normally, one should never reuse an IV with the same encryption * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both * blocks. In this case, however, since we are using the same plaintext as * well all that we end up with is a duplicate of the original ciphertext we * already had. As a result, an attacker with read access to the raw disk will * be able to tell which blocks are the same but this information is given away * by dedup anyway. In order to get the same IVs and encryption keys for * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC * here so that a reproducible checksum of the plaintext is never available to * the attacker. The HMAC key is kept alongside the master key, encrypted on * disk. The first 64 bits of the HMAC are used in place of the random salt, and * the next 96 bits are used as the IV. As a result of this mechanism, dedup * will only work within a clone family since encrypted dedup requires use of * the same master and HMAC keys. */ /* * After encrypting many blocks with the same key we may start to run up * against the theoretical limits of how much data can securely be encrypted * with a single key using the supported encryption modes. The most obvious * limitation is that our risk of generating 2 equivalent 96 bit IVs increases * the more IVs we generate (which both GCM and CCM modes strictly forbid). * This risk actually grows surprisingly quickly over time according to the * Birthday Problem. 
With a total IV space of 2^(96 bits), and assuming we have * generated n IVs with a cryptographically secure RNG, the approximate * probability p(n) of a collision is given as: * * p(n) ~= e^(-n*(n-1)/(2*(2^96))) * * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] * * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion * we must not write more than 398,065,730 blocks with the same encryption key. * Therefore, we rotate our keys after 400,000,000 blocks have been written by * generating a new random 64 bit salt for our HKDF encryption key generation * function. */ #define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 #define ZFS_CURRENT_MAX_SALT_USES \ (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) static unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; typedef struct blkptr_auth_buf { uint64_t bab_prop; /* blk_prop - portable mask */ uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ uint64_t bab_pad; /* reserved for future use */ } blkptr_auth_buf_t; const zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { {"", ZC_TYPE_NONE, 0, "inherit"}, {"", ZC_TYPE_NONE, 0, "on"}, {"", ZC_TYPE_NONE, 0, "off"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} }; void zio_crypt_key_destroy(zio_crypt_key_t *key) { rw_destroy(&key->zk_salt_lock); /* free crypto templates */ crypto_destroy_ctx_template(key->zk_current_tmpl); crypto_destroy_ctx_template(key->zk_hmac_tmpl); /* zero out sensitive data */ memset(key, 0, sizeof (zio_crypt_key_t)); } int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) { int ret; crypto_mechanism_t mech = {0}; uint_t keydata_len; ASSERT(key != NULL); ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); /* * Workaround for GCC 12+ with UBSan enabled deficencies. 
* * GCC 12+ invoked with -fsanitize=undefined incorrectly reports the code * below as violating -Warray-bounds */ #if defined(__GNUC__) && !defined(__clang__) && \ ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \ defined(CONFIG_UBSAN)) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif keydata_len = zio_crypt_table[crypt].ci_keylen; #if defined(__GNUC__) && !defined(__clang__) && \ ((!defined(_KERNEL) && defined(ZFS_UBSAN_ENABLED)) || \ defined(CONFIG_UBSAN)) #pragma GCC diagnostic pop #endif memset(key, 0, sizeof (zio_crypt_key_t)); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); /* fill keydata buffers and salt with random data */ ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); if (ret != 0) goto error; ret = random_get_bytes(key->zk_master_keydata, keydata_len); if (ret != 0) goto error; ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); if (ret != 0) goto error; ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for the ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = &key->zk_hmac_key; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. */ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } static int zio_crypt_key_change_salt(zio_crypt_key_t *key) { int ret = 0; uint8_t salt[ZIO_DATA_SALT_LEN]; crypto_mechanism_t mech; uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; /* generate a new salt */ ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; rw_enter(&key->zk_salt_lock, RW_WRITER); /* someone beat us to the salt rotation, just unlock and return */ if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) goto out_unlock; /* derive the current key from the master key and the new salt */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto out_unlock; /* assign the salt and reset the usage count */ memcpy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); key->zk_salt_count = 0; /* destroy the old context template and create the new one */ crypto_destroy_ctx_template(key->zk_current_tmpl); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; rw_exit(&key->zk_salt_lock); return (0); out_unlock: rw_exit(&key->zk_salt_lock); error: return (ret); } /* See comment above zfs_key_max_salt_uses definition for details */ int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) { int ret; boolean_t salt_change; rw_enter(&key->zk_salt_lock, 
RW_READER); memcpy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= ZFS_CURRENT_MAX_SALT_USES); rw_exit(&key->zk_salt_lock); if (salt_change) { ret = zio_crypt_key_change_salt(key); if (ret != 0) goto error; } return (0); error: return (ret); } /* * This function handles all encryption and decryption in zfs. When * encrypting it expects puio to reference the plaintext and cuio to * reference the ciphertext. cuio must have enough space for the * ciphertext + room for a MAC. datalen should be the length of the * plaintext / ciphertext alone. */ static int zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint8_t *authbuf, uint_t auth_len) { int ret; crypto_data_t plaindata, cipherdata; CK_AES_CCM_PARAMS ccmp; CK_AES_GCM_PARAMS gcmp; crypto_mechanism_t mech; zio_crypt_info_t crypt_info; uint_t plain_full_len, maclen; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); /* lookup the encryption info */ crypt_info = zio_crypt_table[crypt]; /* the mac will always be the last iovec_t in the cipher uio */ maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; ASSERT(maclen <= ZIO_DATA_MAC_LEN); /* setup encryption mechanism (same as crypt) */ mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); /* * Strangely, the ICP requires that plain_full_len must include * the MAC length when decrypting, even though the UIO does not * need to have the extra space allocated. */ if (encrypt) { plain_full_len = datalen; } else { plain_full_len = datalen + maclen; } /* * setup encryption params (currently only AES CCM and AES GCM * are supported) */ if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { ccmp.ulNonceSize = ZIO_DATA_IV_LEN; ccmp.ulAuthDataSize = auth_len; ccmp.authData = authbuf; ccmp.ulMACSize = maclen; ccmp.nonce = ivbuf; ccmp.ulDataSize = plain_full_len; mech.cm_param = (char *)(&ccmp); mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); } else { gcmp.ulIvLen = ZIO_DATA_IV_LEN; gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); gcmp.ulAADLen = auth_len; gcmp.pAAD = authbuf; gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); gcmp.pIv = ivbuf; mech.cm_param = (char *)(&gcmp); mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); } /* populate the cipher and plain data structs. 
*/ plaindata.cd_format = CRYPTO_DATA_UIO; plaindata.cd_offset = 0; plaindata.cd_uio = puio; plaindata.cd_length = plain_full_len; cipherdata.cd_format = CRYPTO_DATA_UIO; cipherdata.cd_offset = 0; cipherdata.cd_uio = cuio; cipherdata.cd_length = datalen + maclen; /* perform the actual encryption */ if (encrypt) { ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } } else { ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata); if (ret != CRYPTO_SUCCESS) { ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); ret = SET_ERROR(ECKSUM); goto error; } } return (0); error: return (ret); } int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) { int ret; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint64_t crypt = key->zk_crypt; uint_t enc_len, keydata_len, aad_len; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); keydata_len = zio_crypt_table[crypt].ci_keylen; /* generate iv for wrapping the master and hmac key */ ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); if (ret != 0) goto error; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata_out; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata_out; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; /* * Although we don't support writing to the old format, we do * support rewrapping the key so that the user can move and * quarantine datasets on the old format. 
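 *
 * (AAD sketch: a version 0 key authenticates only LE_64(zk_guid),
 * while current-version keys authenticate LE_64(guid), LE_64(crypt)
 * and LE_64(version), i.e. three 64-bit words.)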
*/ if (key->zk_version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(key->zk_guid); } else { ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(key->zk_guid); aad[1] = LE_64(crypt); aad[2] = LE_64(key->zk_version); } enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_iovcnt = 2; puio.uio_segflg = UIO_SYSSPACE; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* encrypt the keys and store the resulting ciphertext and mac */ ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; return (0); error: return (ret); } int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, uint8_t *mac, zio_crypt_key_t *key) { crypto_mechanism_t mech; zfs_uio_t puio, cuio; uint64_t aad[3]; iovec_t plain_iovecs[2], cipher_iovecs[3]; uint_t enc_len, keydata_len, aad_len; int ret; ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); keydata_len = zio_crypt_table[crypt].ci_keylen; /* initialize zfs_uio_ts */ plain_iovecs[0].iov_base = key->zk_master_keydata; plain_iovecs[0].iov_len = keydata_len; plain_iovecs[1].iov_base = key->zk_hmac_keydata; plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[0].iov_base = keydata; cipher_iovecs[0].iov_len = keydata_len; cipher_iovecs[1].iov_base = hmac_keydata; cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; cipher_iovecs[2].iov_base = mac; cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; if (version == 0) { aad_len = sizeof (uint64_t); aad[0] = LE_64(guid); } else { ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); aad_len = sizeof (uint64_t) * 3; aad[0] = LE_64(guid); aad[1] = LE_64(crypt); aad[2] = LE_64(version); } enc_len = keydata_len + SHA512_HMAC_KEYLEN; puio.uio_iov = plain_iovecs; puio.uio_segflg = UIO_SYSSPACE; puio.uio_iovcnt = 2; cuio.uio_iov = cipher_iovecs; cuio.uio_iovcnt = 3; cuio.uio_segflg = UIO_SYSSPACE; /* decrypt the keys and store the result in the output buffers */ ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, &puio, &cuio, (uint8_t *)aad, aad_len); if (ret != 0) goto error; /* generate a fresh salt */ ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); if (ret != 0) goto error; /* derive the current key from the master key */ ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); if (ret != 0) goto error; /* initialize keys for ICP */ key->zk_current_key.ck_data = key->zk_current_keydata; key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); key->zk_hmac_key.ck_data = key->zk_hmac_keydata; key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); /* * Initialize the crypto templates. It's ok if this fails because * this is just an optimization. 
*/ mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); ret = crypto_create_ctx_template(&mech, &key->zk_current_key, &key->zk_current_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_current_tmpl = NULL; mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, &key->zk_hmac_tmpl); if (ret != CRYPTO_SUCCESS) key->zk_hmac_tmpl = NULL; key->zk_crypt = crypt; key->zk_version = version; key->zk_guid = guid; key->zk_salt_count = 0; return (0); error: zio_crypt_key_destroy(key); return (ret); } int zio_crypt_generate_iv(uint8_t *ivbuf) { int ret; /* randomly generate the IV */ ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); if (ret != 0) goto error; return (0); error: memset(ivbuf, 0, ZIO_DATA_IV_LEN); return (ret); } int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *digestbuf, uint_t digestlen) { int ret; crypto_mechanism_t mech; crypto_data_t in_data, digest_data; uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); /* initialize sha512-hmac mechanism and crypto data */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; /* initialize the crypto data */ in_data.cd_format = CRYPTO_DATA_RAW; in_data.cd_offset = 0; in_data.cd_length = datalen; in_data.cd_raw.iov_base = (char *)data; in_data.cd_raw.iov_len = in_data.cd_length; digest_data.cd_format = CRYPTO_DATA_RAW; digest_data.cd_offset = 0; digest_data.cd_length = SHA512_DIGEST_LENGTH; digest_data.cd_raw.iov_base = (char *)raw_digestbuf; digest_data.cd_raw.iov_len = digest_data.cd_length; /* generate the hmac */ ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, &digest_data); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(digestbuf, raw_digestbuf, digestlen); return (0); error: memset(digestbuf, 0, digestlen); return (ret); } int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, uint8_t *ivbuf, uint8_t *salt) { int ret; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; ret = zio_crypt_do_hmac(key, data, datalen, digestbuf, SHA512_DIGEST_LENGTH); if (ret != 0) return (ret); memcpy(salt, digestbuf, ZIO_DATA_SALT_LEN); memcpy(ivbuf, digestbuf + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN); return (0); } /* * The following functions are used to encode and decode encryption parameters * into blkptr_t and zil_header_t. The ICP wants to use these parameters as * byte strings, which normally means that these strings would not need to deal * with byteswapping at all. However, both blkptr_t and zil_header_t may be * byteswapped by lower layers and so we must "undo" that byteswap here upon * decoding and encoding in a non-native byteorder. These functions require * that the byteorder bit is correct before being called. 
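 *
 * (Placement sketch, for orientation only: the 64-bit salt is kept in
 * DVA[2] word 0, the first 64 bits of the IV in DVA[2] word 1, the
 * remaining 32 IV bits behind BP_GET_IV2()/BP_SET_IV2(), and the MAC
 * in blk_cksum words 2 and 3; ZIL blocks instead keep their MAC in
 * the zil_chain_t embedded checksum.)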
*/ void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_ENCRYPTED(bp)); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); memcpy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, val32); } else { memcpy(&val64, salt, sizeof (uint64_t)); bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); memcpy(&val64, iv, sizeof (uint64_t)); bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); memcpy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); BP_SET_IV2(bp, BSWAP_32(val32)); } } void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) { uint64_t val64; uint32_t val32; ASSERT(BP_IS_PROTECTED(bp)); /* for convenience, so callers don't need to check */ if (BP_IS_AUTHENTICATED(bp)) { memset(salt, 0, ZIO_DATA_SALT_LEN); memset(iv, 0, ZIO_DATA_IV_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); memcpy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); val32 = (uint32_t)BP_GET_IV2(bp); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } else { val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); memcpy(salt, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); memcpy(iv, &val64, sizeof (uint64_t)); val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); memcpy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); } } void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp)); ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } else { memcpy(&val64, mac, sizeof (uint64_t)); bp->blk_cksum.zc_word[2] = BSWAP_64(val64); memcpy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); bp->blk_cksum.zc_word[3] = BSWAP_64(val64); } } void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) { uint64_t val64; ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); /* for convenience, so callers don't need to check */ if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { memset(mac, 0, ZIO_DATA_MAC_LEN); return; } if (!BP_SHOULD_BYTESWAP(bp)) { memcpy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], sizeof (uint64_t)); } else { val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); memcpy(mac, &val64, sizeof (uint64_t)); val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); memcpy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); } } void zio_crypt_encode_mac_zil(void *data, uint8_t *mac) { zil_chain_t *zilc = data; memcpy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); memcpy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), sizeof (uint64_t)); } void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) { /* * The ZIL MAC is embedded in the block it protects, which will * not have been byteswapped by the time this function has been called. * As a result, we don't need to worry about byteswapping the MAC. */ const zil_chain_t *zilc = data; memcpy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); memcpy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], sizeof (uint64_t)); } /* * This routine takes a block of dnodes (src_abd) and copies only the bonus * buffers to the same offsets in the dst buffer. 
datalen should be the size * of both the src_abd and the dst buffer (not just the length of the bonus * buffers). */ void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) { uint_t i, max_dnp = datalen >> DNODE_SHIFT; uint8_t *src; dnode_phys_t *dnp, *sdnp, *ddnp; src = abd_borrow_buf_copy(src_abd, datalen); sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), DN_MAX_BONUS_LEN(dnp)); } } abd_return_buf(src_abd, src, datalen); } /* * This function decides what fields from blk_prop are included in * the on-disk various MAC algorithms. */ static void zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) { /* * Version 0 did not properly zero out all non-portable fields * as it should have done. We maintain this code so that we can * do read-only imports of pools on this version. */ if (version == 0) { BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); return; } ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); /* * The hole_birth feature might set these fields even if this bp * is a hole. We zero them out here to guarantee that raw sends * will function with or without the feature. */ if (BP_IS_HOLE(bp)) { bp->blk_prop = 0ULL; return; } /* * At L0 we want to verify these fields to ensure that data blocks * can not be reinterpreted. For instance, we do not want an attacker * to trick us into returning raw lz4 compressed data to the user * by modifying the compression bits. At higher levels, we cannot * enforce this policy since raw sends do not convey any information * about indirect blocks, so these values might be different on the * receive side. Fortunately, this does not open any new attack * vectors, since any alterations that can be made to a higher level * bp must still verify the correct order of the layer below it. */ if (BP_GET_LEVEL(bp) != 0) { BP_SET_BYTEORDER(bp, 0); BP_SET_COMPRESS(bp, 0); /* * psize cannot be set to zero or it will trigger * asserts, but the value doesn't really matter as * long as it is constant. */ BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); } BP_SET_DEDUP(bp, 0); BP_SET_CHECKSUM(bp, 0); } static void zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, blkptr_auth_buf_t *bab, uint_t *bab_len) { blkptr_t tmpbp = *bp; if (should_bswap) byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); ASSERT0(BP_IS_EMBEDDED(&tmpbp)); zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); /* * We always MAC blk_prop in LE to ensure portability. This * must be done after decoding the mac, since the endianness * will get zero'd out here. 
*/ zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); bab->bab_prop = LE_64(tmpbp.blk_prop); bab->bab_pad = 0ULL; /* version 0 did not include the padding */ *bab_len = sizeof (blkptr_auth_buf_t); if (version == 0) *bab_len -= sizeof (uint64_t); } static int zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { int ret; uint_t bab_len; blkptr_auth_buf_t bab; crypto_data_t cd; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; cd.cd_length = bab_len; cd.cd_raw.iov_base = (char *)&bab; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } return (0); error: return (ret); } static void zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); SHA2Update(ctx, &bab, bab_len); } static void zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, boolean_t should_bswap, blkptr_t *bp) { uint_t bab_len; blkptr_auth_buf_t bab; zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); memcpy(*aadp, &bab, bab_len); *aadp += bab_len; *aad_len += bab_len; } static int zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, boolean_t should_bswap, dnode_phys_t *dnp) { int ret, i; dnode_phys_t *adnp, tmp_dncore; size_t dn_core_size = offsetof(dnode_phys_t, dn_blkptr); boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); crypto_data_t cd; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* * Authenticate the core dnode (masking out non-portable bits). * We only copy the first 64 bytes we operate on to avoid the overhead * of copying 512-64 unneeded bytes. The compiler seems to be fine * with that. */ memcpy(&tmp_dncore, dnp, dn_core_size); adnp = &tmp_dncore; if (le_bswap) { adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); adnp->dn_used = BSWAP_64(adnp->dn_used); } adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; cd.cd_length = dn_core_size; cd.cd_raw.iov_base = (char *)adnp; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } for (i = 0; i < dnp->dn_nblkptr; i++) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, &dnp->dn_blkptr[i]); if (ret != 0) goto error; } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { ret = zio_crypt_bp_do_hmac_updates(ctx, version, should_bswap, DN_SPILL_BLKPTR(dnp)); if (ret != 0) goto error; } return (0); error: return (ret); } /* * objset_phys_t blocks introduce a number of exceptions to the normal * authentication process. objset_phys_t's contain 2 separate HMACS for * protecting the integrity of their data. The portable_mac protects the * metadnode. This MAC can be sent with a raw send and protects against * reordering of data within the metadnode. The local_mac protects the user * accounting objects which are not sent from one system to another. * * In addition, objset blocks are the only blocks that can be modified and * written to disk without the key loaded under certain circumstances. 
During * zil_claim() we need to be able to update the zil_header_t to complete * claiming log blocks and during raw receives we need to write out the * portable_mac from the send file. Both of these actions are possible * because these fields are not protected by either MAC so neither one will * need to modify the MACs without the key. However, when the modified blocks * are written out they will be byteswapped into the host machine's native * endianness which will modify fields protected by the MAC. As a result, MAC * calculation for objset blocks works slightly differently from other block * types. Where other block types MAC the data in whatever endianness is * written to disk, objset blocks always MAC little endian version of their * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() * and le_bswap indicates whether a byteswap is needed to get this block * into little endian format. */ int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) { int ret; crypto_mechanism_t mech; crypto_context_t ctx; crypto_data_t cd; objset_phys_t *osp = data; uint64_t intval; boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; /* initialize HMAC mechanism */ mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); mech.cm_param = NULL; mech.cm_param_len = 0; cd.cd_format = CRYPTO_DATA_RAW; cd.cd_offset = 0; /* calculate the portable MAC from the portable fields and metadnode */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the os_type */ intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the metadnode */ ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_meta_dnode); if (ret) goto error; /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_portable_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(portable_mac, raw_portable_mac, ZIO_OBJSET_MAC_LEN); /* * This is necessary here as we check next whether * OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in order to * decide if the local_mac should be zeroed out. That flag will always * be set by dmu_objset_id_quota_upgrade_cb() and * dmu_objset_userspace_upgrade_cb() if useraccounting has been * completed. */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); boolean_t uacct_incomplete = !(intval & OBJSET_FLAG_USERACCOUNTING_COMPLETE); /* * The local MAC protects the user, group and project accounting. * If these objects are not present, the local MAC is zeroed out. 
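 *
 * Summarizing the cases handled just below (this is a restatement of
 * the condition that follows, for readability):
 *
 *	local_mac is zeroed out when:
 *	    - user accounting has not been completed yet
 *	      (OBJSET_FLAG_USERACCOUNTING_COMPLETE is clear), or
 *	    - the objset is V3-sized and the userused, groupused and
 *	      projectused dnodes are all DMU_OT_NONE, or
 *	    - the objset is V2-sized and the userused and groupused
 *	      dnodes are both DMU_OT_NONE, or
 *	    - the objset is V1-sized, which has no accounting dnodes.
 *
 *	Otherwise the local MAC is computed over the non-portable
 *	os_flags and whichever accounting dnodes are present.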
*/ if (uacct_incomplete || (datalen >= OBJSET_PHYS_SIZE_V3 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE && osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || (datalen >= OBJSET_PHYS_SIZE_V2 && osp->os_userused_dnode.dn_type == DMU_OT_NONE && osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || (datalen <= OBJSET_PHYS_SIZE_V1)) { memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (0); } /* calculate the local MAC from the userused and groupused dnodes */ ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in the non-portable os_flags */ intval = osp->os_flags; if (should_bswap) intval = BSWAP_64(intval); intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; if (!ZFS_HOST_BYTEORDER) intval = BSWAP_64(intval); cd.cd_length = sizeof (uint64_t); cd.cd_raw.iov_base = (char *)&intval; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_update(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } /* add in fields from the user accounting dnodes */ if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_userused_dnode); if (ret) goto error; } if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_groupused_dnode); if (ret) goto error; } if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && datalen >= OBJSET_PHYS_SIZE_V3) { ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, should_bswap, &osp->os_projectused_dnode); if (ret) goto error; } /* store the final digest in a temporary buffer and copy what we need */ cd.cd_length = SHA512_DIGEST_LENGTH; cd.cd_raw.iov_base = (char *)raw_local_mac; cd.cd_raw.iov_len = cd.cd_length; ret = crypto_mac_final(ctx, &cd); if (ret != CRYPTO_SUCCESS) { ret = SET_ERROR(EIO); goto error; } memcpy(local_mac, raw_local_mac, ZIO_OBJSET_MAC_LEN); return (0); error: memset(portable_mac, 0, ZIO_OBJSET_MAC_LEN); memset(local_mac, 0, ZIO_OBJSET_MAC_LEN); return (ret); } static void zio_crypt_destroy_uio(zfs_uio_t *uio) { if (uio->uio_iov) kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); } /* * This function parses an uncompressed indirect block and returns a checksum * of all the portable fields from all of the contained bps. The portable * fields are the MAC and all of the fields from blk_prop except for the dedup, * checksum, and psize bits. For an explanation of the purpose of this, see * the comment block on object set authentication. */ static int zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) { blkptr_t *bp; int i, epb = datalen >> SPA_BLKPTRSHIFT; SHA2_CTX ctx; uint8_t digestbuf[SHA512_DIGEST_LENGTH]; /* checksum all of the MACs from the layer below */ SHA2Init(SHA512, &ctx); for (i = 0, bp = buf; i < epb; i++, bp++) { zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, byteswap, bp); } SHA2Final(digestbuf, &ctx); if (generate) { memcpy(cksum, digestbuf, ZIO_DATA_MAC_LEN); return (0); } if (memcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) return (SET_ERROR(ECKSUM)); return (0); } int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; /* * Unfortunately, callers of this function will not always have * easy access to the on-disk format version. 
This info is * normally found in the DSL Crypto Key, but the checksum-of-MACs * is expected to be verifiable even when the key isn't loaded. * Here, instead of doing a ZAP lookup for the version for each * zio, we simply try both existing formats. */ ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); if (ret == ECKSUM) { ASSERT(!generate); ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, datalen, 0, byteswap, cksum); } return (ret); } int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, uint_t datalen, boolean_t byteswap, uint8_t *cksum) { int ret; void *buf; buf = abd_borrow_buf_copy(abd, datalen); ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, byteswap, cksum); abd_return_buf(abd, buf, datalen); return (ret); } /* * Special case handling routine for encrypting / decrypting ZIL blocks. * We do not check for the older ZIL chain because the encryption feature * was not available before the newer ZIL chain was introduced. The goal * here is to encrypt everything except the blkptr_t of a lr_write_t and * the zil_chain_t header. Everything that is not encrypted is authenticated. */ static int zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; zil_chain_t *zilc; lr_t *lr; uint8_t *aadbuf = zio_buf_alloc(datalen); /* cipherbuf always needs an extra iovec for the MAC */ if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } memset(dst, 0, datalen); /* find the start and end record of the log block */ zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) nr_iovecs++; } nr_src += nr_iovecs; nr_dst += nr_iovecs; /* allocate the iovec arrays */ if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } /* * Copy the plain zil header over and authenticate everything except * the checksum that will store our MAC. If we are writing the data * the embedded checksum will not have been calculated yet, so we don't * authenticate that. 
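 *
 * To illustrate what is and is not covered (field names are taken from
 * the zil_chain_t definition in the ZIL headers; this sketch is
 * illustrative only):
 *
 *	zil_chain_t {
 *		uint64_t  zc_pad;	authenticated
 *		blkptr_t  zc_next_blk;	authenticated
 *		uint64_t  zc_nused;	authenticated
 *		zio_eck_t zc_eck;	not authenticated, holds our MAC
 *	}
 *
 * which is why the AAD copy below stops at
 * sizeof (zil_chain_t) - sizeof (zio_eck_t).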
*/ memcpy(dst, src, sizeof (zil_chain_t)); memcpy(aadp, src, sizeof (zil_chain_t) - sizeof (zio_eck_t)); aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); /* loop over records again, filling in iovecs */ nr_iovecs = 0; slrp = src + sizeof (zil_chain_t); dlrp = dst + sizeof (zil_chain_t); for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { lr = (lr_t *)slrp; if (!byteswap) { txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } else { txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } /* copy the common lr_t */ memcpy(dlrp, slrp, sizeof (lr_t)); memcpy(aadp, slrp, sizeof (lr_t)); aadp += sizeof (lr_t); aad_len += sizeof (lr_t); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); /* * If this is a TX_WRITE record we want to encrypt everything * except the bp if exists. If the bp does exist we want to * authenticate it. */ if (txtype == TX_WRITE) { const size_t o = offsetof(lr_write_t, lr_blkptr); crypt_len = o - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bp now since it will not be encrypted */ memcpy(dlrp + o, slrp + o, sizeof (blkptr_t)); memcpy(aadp, slrp + o, sizeof (blkptr_t)); aadp += sizeof (blkptr_t); aad_len += sizeof (blkptr_t); nr_iovecs++; total_len += crypt_len; if (lr_len != sizeof (lr_write_t)) { crypt_len = lr_len - sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_write_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_write_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } else if (txtype == TX_CLONE_RANGE) { const size_t o = offsetof(lr_clone_range_t, lr_nbps); crypt_len = o - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; /* copy the bps now since they will not be encrypted */ memcpy(dlrp + o, slrp + o, lr_len - o); memcpy(aadp, slrp + o, lr_len - o); aadp += lr_len - o; aad_len += lr_len - o; nr_iovecs++; total_len += crypt_len; } else { crypt_len = lr_len - sizeof (lr_t); src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * Special case handling routine for encrypting / decrypting dnode blocks. 
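 * Roughly, each dnode slot is split as follows (this is a summary of
 * the loop below, for illustration only):
 *
 *	core fields (first 64 bytes)	copied in the clear, authenticated
 *					with non-portable dn_flags bits and
 *					dn_used masked out
 *	dn_blkptr[] and spill bp	copied in the clear, with their MACs
 *					and portable blk_prop bits added to
 *					the AAD
 *	bonus buffer			becomes a plaintext/ciphertext iovec
 *					pair if dn_bonustype is an encrypted
 *					type, otherwise it is copied and
 *					authenticated like the rest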
*/ static int zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; uint8_t *src, *dst, *aadp; dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; uint8_t *aadbuf = zio_buf_alloc(datalen); if (encrypt) { src = plainbuf; dst = cipherbuf; nr_src = 0; nr_dst = 1; } else { src = cipherbuf; dst = plainbuf; nr_src = 1; nr_dst = 0; } sdnp = (dnode_phys_t *)src; ddnp = (dnode_phys_t *)dst; aadp = aadbuf; /* * Count the number of iovecs we will need to do the encryption by * counting the number of bonus buffers that need to be encrypted. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { /* * This block may still be byteswapped. However, all of the * values we use are either uint8_t's (for which byteswapping * is a noop) or a * != 0 check, which will work regardless * of whether or not we byteswap. */ if (sdnp[i].dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && sdnp[i].dn_bonuslen != 0) { nr_iovecs++; } } nr_src += nr_iovecs; nr_dst += nr_iovecs; if (nr_src != 0) { src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); if (src_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } if (nr_dst != 0) { dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); if (dst_iovecs == NULL) { ret = SET_ERROR(ENOMEM); goto error; } } nr_iovecs = 0; /* * Iterate through the dnodes again, this time filling in the uios * we allocated earlier. We also concatenate any data we want to * authenticate onto aadbuf. */ for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { dnp = &sdnp[i]; /* copy over the core fields and blkptrs (kept as plaintext) */ memcpy(&ddnp[i], dnp, (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { memcpy(DN_SPILL_BLKPTR(&ddnp[i]), DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t)); } /* * Handle authenticated data. We authenticate everything in * the dnode that can be brought over when we do a raw send. * This includes all of the core fields as well as the MACs * stored in the bp checksums and all of the portable bits * from blk_prop. We include the dnode padding here in case it * ever gets used in the future. Some dn_flags and dn_used are * not portable so we mask those out values out of the * authenticated data. */ crypt_len = offsetof(dnode_phys_t, dn_blkptr); memcpy(aadp, dnp, crypt_len); adnp = (dnode_phys_t *)aadp; adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; adnp->dn_used = 0; aadp += crypt_len; aad_len += crypt_len; for (j = 0; j < dnp->dn_nblkptr; j++) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, &dnp->dn_blkptr[j]); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zio_crypt_bp_do_aad_updates(&aadp, &aad_len, version, byteswap, DN_SPILL_BLKPTR(dnp)); } /* * If this bonus buffer needs to be encrypted, we prepare an * iovec_t. The encryption / decryption functions will fill * this in for us with the encrypted or decrypted data. * Otherwise we add the bonus buffer to the authenticated * data buffer and copy it over to the destination. The * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that * we can guarantee alignment with the AES block size * (128 bits). 
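 * As a worked example (numbers for illustration only): a single-slot
 * dnode with one block pointer and no spill pointer has
 * 512 - 64 - 128 = 320 bytes of bonus space here, and 320 is a
 * multiple of the 16-byte AES block, so no partial block is produced
 * regardless of the actual dn_bonuslen.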
*/ crypt_len = DN_MAX_BONUS_LEN(dnp); if (dnp->dn_type != DMU_OT_NONE && DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && dnp->dn_bonuslen != 0) { ASSERT3U(nr_iovecs, <, nr_src); ASSERT3U(nr_iovecs, <, nr_dst); ASSERT3P(src_iovecs, !=, NULL); ASSERT3P(dst_iovecs, !=, NULL); src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); src_iovecs[nr_iovecs].iov_len = crypt_len; dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); dst_iovecs[nr_iovecs].iov_len = crypt_len; nr_iovecs++; total_len += crypt_len; } else { memcpy(DN_BONUS(&ddnp[i]), DN_BONUS(dnp), crypt_len); memcpy(aadp, DN_BONUS(dnp), crypt_len); aadp += crypt_len; aad_len += crypt_len; } } *no_crypt = (nr_iovecs == 0); *enc_len = total_len; *authbuf = aadbuf; *auth_len = aad_len; if (encrypt) { puio->uio_iov = src_iovecs; puio->uio_iovcnt = nr_src; cuio->uio_iov = dst_iovecs; cuio->uio_iovcnt = nr_dst; } else { puio->uio_iov = dst_iovecs; puio->uio_iovcnt = nr_dst; cuio->uio_iov = src_iovecs; cuio->uio_iovcnt = nr_src; } return (0); error: zio_buf_free(aadbuf, datalen); if (src_iovecs != NULL) kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); if (dst_iovecs != NULL) kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); *enc_len = 0; *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } static int zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len) { (void) encrypt; int ret; uint_t nr_plain = 1, nr_cipher = 2; iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; /* allocate the iovecs for the plain and cipher data */ plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), KM_SLEEP); if (!plain_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), KM_SLEEP); if (!cipher_iovecs) { ret = SET_ERROR(ENOMEM); goto error; } plain_iovecs[0].iov_base = plainbuf; plain_iovecs[0].iov_len = datalen; cipher_iovecs[0].iov_base = cipherbuf; cipher_iovecs[0].iov_len = datalen; *enc_len = datalen; puio->uio_iov = plain_iovecs; puio->uio_iovcnt = nr_plain; cuio->uio_iov = cipher_iovecs; cuio->uio_iovcnt = nr_cipher; return (0); error: if (plain_iovecs != NULL) kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); if (cipher_iovecs != NULL) kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); *enc_len = 0; puio->uio_iov = NULL; puio->uio_iovcnt = 0; cuio->uio_iov = NULL; cuio->uio_iovcnt = 0; return (ret); } /* * This function builds up the plaintext (puio) and ciphertext (cuio) uios so * that they can be used for encryption and decryption by zio_do_crypt_uio(). * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks * requiring special handling to parse out pieces that are to be encrypted. The * authbuf is used by these special cases to store additional authenticated * data (AAD) for the encryption modes. 
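 *
 * The resulting uio layout is, roughly:
 *
 *	puio: [ plaintext iovec 0 ] ... [ plaintext iovec N-1 ]
 *	cuio: [ ciphertext iovec 0 ] ... [ ciphertext iovec N-1 ] [ MAC ]
 *
 * where the trailing cuio iovec is pointed at the caller's mac buffer
 * (ZIO_DATA_MAC_LEN bytes) by zio_crypt_init_uios() below.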
*/ static int zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uint8_t *mac, zfs_uio_t *puio, zfs_uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, boolean_t *no_crypt) { int ret; iovec_t *mac_iov; ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); /* route to handler */ switch (ot) { case DMU_OT_INTENT_LOG: ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; case DMU_OT_DNODE: ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, no_crypt); break; default: ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, datalen, puio, cuio, enc_len); *authbuf = NULL; *auth_len = 0; *no_crypt = B_FALSE; break; } if (ret != 0) goto error; /* populate the uios */ puio->uio_segflg = UIO_SYSSPACE; cuio->uio_segflg = UIO_SYSSPACE; mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); mac_iov->iov_base = mac; mac_iov->iov_len = ZIO_DATA_MAC_LEN; return (0); error: return (ret); } /* * Primary encryption / decryption entrypoint for zio data. */ int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, boolean_t *no_crypt) { int ret; boolean_t locked = B_FALSE; uint64_t crypt = key->zk_crypt; uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; uint_t enc_len, auth_len; zfs_uio_t puio, cuio; uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; crypto_key_t tmp_ckey, *ckey = NULL; crypto_ctx_template_t tmpl; uint8_t *authbuf = NULL; memset(&puio, 0, sizeof (puio)); memset(&cuio, 0, sizeof (cuio)); /* * If the needed key is the current one, just use it. Otherwise we * need to generate a temporary one from the given salt + master key. * If we are encrypting, we must return a copy of the current salt * so that it can be stored in the blkptr_t. */ rw_enter(&key->zk_salt_lock, RW_READER); locked = B_TRUE; if (memcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { ckey = &key->zk_current_key; tmpl = key->zk_current_tmpl; } else { rw_exit(&key->zk_salt_lock); locked = B_FALSE; ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); if (ret != 0) goto error; tmp_ckey.ck_data = enc_keydata; tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); ckey = &tmp_ckey; tmpl = NULL; } /* * Attempt to use QAT acceleration if we can. We currently don't * do this for metadnode and ZIL blocks, since they have a much * more involved buffer layout and the qat_crypt() function only * works in-place. */ if (qat_crypt_use_accel(datalen) && ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) { uint8_t *srcbuf, *dstbuf; if (encrypt) { srcbuf = plainbuf; dstbuf = cipherbuf; } else { srcbuf = cipherbuf; dstbuf = plainbuf; } ret = qat_crypt((encrypt) ? 
QAT_ENCRYPT : QAT_DECRYPT, srcbuf, dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen); if (ret == CPA_STATUS_SUCCESS) { if (locked) { rw_exit(&key->zk_salt_lock); locked = B_FALSE; } return (0); } /* If the hardware implementation fails fall back to software */ } /* create uios for encryption */ ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, &authbuf, &auth_len, no_crypt); if (ret != 0) goto error; /* perform the encryption / decryption in software */ ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, &puio, &cuio, authbuf, auth_len); if (ret != 0) goto error; if (locked) { rw_exit(&key->zk_salt_lock); } if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (0); error: if (locked) rw_exit(&key->zk_salt_lock); if (authbuf != NULL) zio_buf_free(authbuf, datalen); if (ckey == &tmp_ckey) memset(enc_keydata, 0, keydata_len); zio_crypt_destroy_uio(&puio); zio_crypt_destroy_uio(&cuio); return (ret); } /* * Simple wrapper around zio_do_crypt_data() to work with abd's instead of * linear buffers. */ int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) { int ret; void *ptmp, *ctmp; if (encrypt) { ptmp = abd_borrow_buf_copy(pabd, datalen); ctmp = abd_borrow_buf(cabd, datalen); } else { ptmp = abd_borrow_buf(pabd, datalen); ctmp = abd_borrow_buf_copy(cabd, datalen); } ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, datalen, ptmp, ctmp, no_crypt); if (ret != 0) goto error; if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (0); error: if (encrypt) { abd_return_buf(pabd, ptmp, datalen); abd_return_buf_copy(cabd, ctmp, datalen); } else { abd_return_buf_copy(pabd, ptmp, datalen); abd_return_buf(cabd, ctmp, datalen); } return (ret); } #if defined(_KERNEL) /* CSTYLED */ module_param(zfs_key_max_salt_uses, ulong, 0644); MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " "can be used for generating encryption keys before it is rotated"); #endif diff --git a/sys/contrib/openzfs/module/zfs/brt.c b/sys/contrib/openzfs/module/zfs/brt.c index b0529521ec76..759bc8d2e2b8 100644 --- a/sys/contrib/openzfs/module/zfs/brt.c +++ b/sys/contrib/openzfs/module/zfs/brt.c @@ -1,1753 +1,1753 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Block Cloning design. * * Block Cloning allows to manually clone a file (or a subset of its blocks) * into another (or the same) file by just creating additional references to * the data blocks without copying the data itself. Those references are kept * in the Block Reference Tables (BRTs). * * In many ways this is similar to the existing deduplication, but there are * some important differences: * * - Deduplication is automatic and Block Cloning is not - one has to use a * dedicated system call(s) to clone the given file/blocks. * - Deduplication keeps all data blocks in its table, even those referenced * just once. Block Cloning creates an entry in its tables only when there * are at least two references to the given data block. If the block was * never explicitly cloned or the second to last reference was dropped, * there will be neither space nor performance overhead. * - Deduplication needs data to work - one needs to pass real data to the * write(2) syscall, so hash can be calculated. Block Cloning doesn't require * data, just block pointers to the data, so it is extremely fast, as we pay * neither the cost of reading the data, nor the cost of writing the data - * we operate exclusively on metadata. * - If the D (dedup) bit is not set in the block pointer, it means that * the block is not in the dedup table (DDT) and we won't consult the DDT * when we need to free the block. Block Cloning must be consulted on every * free, because we cannot modify the source BP (eg. by setting something * similar to the D bit), thus we have no hint if the block is in the * Block Reference Table (BRT), so we need to look into the BRT. There is * an optimization in place that allows us to eliminate the majority of BRT * lookups which is described below in the "Minimizing free penalty" section. * - The BRT entry is much smaller than the DDT entry - for BRT we only store * 64bit offset and 64bit reference counter. * - Dedup keys are cryptographic hashes, so two blocks that are close to each * other on disk are most likely in totally different parts of the DDT. * The BRT entry keys are offsets into a single top-level VDEV, so data blocks * from one file should have BRT entries close to each other. * - Scrub will only do a single pass over a block that is referenced multiple * times in the DDT. Unfortunately it is not currently (if at all) possible * with Block Cloning and block referenced multiple times will be scrubbed * multiple times. The new, sorted scrub should be able to eliminate * duplicated reads given enough memory. * - Deduplication requires cryptographically strong hash as a checksum or * additional data verification. Block Cloning works with any checksum * algorithm or even with checksumming disabled. * * As mentioned above, the BRT entries are much smaller than the DDT entries. * To uniquely identify a block we just need its vdev id and offset. We also * need to maintain a reference counter. The vdev id will often repeat, as there * is a small number of top-level VDEVs and a large number of blocks stored in * each VDEV. 
We take advantage of that to reduce the BRT entry size further by * maintaining one BRT for each top-level VDEV, so we can then have only offset * and counter as the BRT entry. * * Minimizing free penalty. * * Block Cloning allows creating additional references to any existing block. * When we free a block there is no hint in the block pointer whether the block * was cloned or not, so on each free we have to check if there is a * corresponding entry in the BRT or not. If there is, we need to decrease * the reference counter. Doing BRT lookup on every free can potentially be * expensive by requiring additional I/Os if the BRT doesn't fit into memory. * This is the main problem with deduplication, so we've learned our lesson and * try not to repeat the same mistake here. How do we do that? We divide each * top-level VDEV into 16MB regions. For each region we maintain a counter that * is a sum of all the BRT entries that have offsets within the region. This * creates the entries count array of 16bit numbers for each top-level VDEV. * The entries count array is always kept in memory and updated on disk in the * same transaction group as the BRT updates to keep everything in-sync. We can * keep the array in memory, because it is very small. With 16MB regions and * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease * the region size even further in the future). Now, when we want to free * a block, we first consult the array. If the counter for the whole region is * zero, there is no need to look for the BRT entry, as there isn't one for * sure. If the counter for the region is greater than zero, only then we will * do a BRT lookup and if an entry is found we will decrease the reference * counter in the BRT entry and in the entry counters array. * * The entry counters array is small, but can potentially be larger for very * large VDEVs or smaller regions. In this case we don't want to rewrite entire * array on every change. We then divide the array into 32kB block and keep * a bitmap of dirty blocks within a transaction group. When we sync the * transaction group we can only update the parts of the entry counters array * that were modified. Note: Keeping track of the dirty parts of the entry * counters array is implemented, but updating only parts of the array on disk * is not yet implemented - for now we will update entire array if there was * any change. * * The implementation tries to be economic: if BRT is not used, or no longer * used, there will be no entries in the MOS and no additional memory used (eg. * the entry counters array is only allocated if needed). * * Interaction between Deduplication and Block Cloning. * * If both functionalities are in use, we could end up with a block that is * referenced multiple times in both DDT and BRT. When we free one of the * references we couldn't tell where it belongs, so we would have to decide * what table takes the precedence: do we first clear DDT references or BRT * references? To avoid this dilemma BRT cooperates with DDT - if a given block * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will * lookup DDT entry instead and increase the counter there. No BRT entry * will be created for a block which has the D (dedup) bit set. * BRT may be more efficient for manual deduplication, but if the block is * already in the DDT, then creating additional BRT entry would be less * efficient. This clever idea was proposed by Allan Jude. * * Block Cloning across datasets. 
* * Block Cloning is not limited to cloning blocks within the same dataset. * It is possible (and very useful) to clone blocks between different datasets. * One use case is recovering files from snapshots. By cloning the files into * dataset we need no additional storage. Without Block Cloning we would need * additional space for those files. * Another interesting use case is moving the files between datasets * (copying the file content to the new dataset and removing the source file). * In that case Block Cloning will only be used briefly, because the BRT entries * will be removed when the source is removed. * Note: currently it is not possible to clone blocks between encrypted * datasets, even if those datasets use the same encryption key (this includes * snapshots of encrypted datasets). Cloning blocks between datasets that use * the same keys should be possible and should be implemented in the future. * * Block Cloning flow through ZFS layers. * * Note: Block Cloning can be used both for cloning file system blocks and ZVOL * blocks. As of this writing no interface is implemented that allows for block * cloning within a ZVOL. * FreeBSD and Linux provides copy_file_range(2) system call and we will use it * for blocking cloning. * * ssize_t * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, * size_t len, unsigned int flags); * * Even though offsets and length represent bytes, they have to be * block-aligned or we will return an error so the upper layer can * fallback to the generic mechanism that will just copy the data. * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. * This function was implemented based on zfs_write(), but instead of writing * the given data we first read block pointers using the new dmu_read_l0_bps() * function from the source file. Once we have BPs from the source file we call * the dmu_brt_clone() function on the destination file. This function * allocates BPs for us. We iterate over all source BPs. If the given BP is * a hole or an embedded block, we just copy BP as-is. If it points to a real * data we place this BP on a BRT pending list using the brt_pending_add() * function. * * We use this pending list to keep track of all BPs that got new references * within this transaction group. * * Some special cases to consider and how we address them: * - The block we want to clone may have been created within the same * transaction group that we are trying to clone. Such block has no BP * allocated yet, so cannot be immediately cloned. We return EAGAIN. * - The block we want to clone may have been modified within the same * transaction group. We return EAGAIN. * - A block may be cloned multiple times during one transaction group (that's * why pending list is actually a tree and not an append-only list - this * way we can figure out faster if this block is cloned for the first time * in this txg or consecutive time). * - A block may be cloned and freed within the same transaction group * (see dbuf_undirty()). * - A block may be cloned and within the same transaction group the clone * can be cloned again (see dmu_read_l0_bps()). * - A file might have been deleted, but the caller still has a file descriptor * open to this file and clones it. * * When we free a block we have an additional step in the ZIO pipeline where we * call the zio_brt_free() function. We then call the brt_entry_decref() * that loads the corresponding BRT entry (if one exists) and decreases * reference counter. 
If this is not the last reference we will stop ZIO * pipeline here. If this is the last reference or the block is not in the * BRT, we continue the pipeline and free the block as usual. * * At the beginning of spa_sync() where there can be no more block cloning, * but before issuing frees we call brt_pending_apply(). This function applies * all the new clones to the BRT table - we load BRT entries and update * reference counters. To sync new BRT entries to disk, we use brt_sync() * function. This function will sync all dirty per-top-level-vdev BRTs, * the entry counters arrays, etc. * * Block Cloning and ZIL. * * Every clone operation is divided into chunks (similar to write) and each * chunk is cloned in a separate transaction. The chunk size is determined by * how many BPs we can fit into a single ZIL entry. * Replaying clone operation is different from the regular clone operation, * as when we log clone operations we cannot use the source object - it may * reside on a different dataset, so we log BPs we want to clone. * The ZIL is replayed when we mount the given dataset, not when the pool is * imported. Taking this into account it is possible that the pool is imported * without mounting datasets and the source dataset is destroyed before the * destination dataset is mounted and its ZIL replayed. * To address this situation we leverage zil_claim() mechanism where ZFS will * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE - * entries, we will bump reference counters for their BPs in the BRT and then - * on mount and ZIL replay we will just attach BPs to the file without - * bumping reference counters. - * Note it is still possible that after zil_claim() we never mount the - * destination, so we never replay its ZIL and we destroy it. This way we would - * end up with leaked references in BRT. We address that too as ZFS gives us - * a chance to clean this up on dataset destroy (see zil_free_clone_range()). + * entries, we will bump reference counters for their BPs in the BRT. Then + * on mount and ZIL replay we bump the reference counters once more, while the + * first references are dropped during ZIL destroy by zil_free_clone_range(). + * It is possible that after zil_claim() we never mount the destination, so + * we never replay its ZIL and just destroy it. In this case the only taken + * references will be dropped by zil_free_clone_range(), since the cloning is + * not going to ever take place. */ static kmem_cache_t *brt_entry_cache; static kmem_cache_t *brt_pending_entry_cache; /* * Enable/disable prefetching of BRT entries that we are going to modify. */ int zfs_brt_prefetch = 1; #ifdef ZFS_DEBUG #define BRT_DEBUG(...) do { \ if ((zfs_flags & ZFS_DEBUG_BRT) != 0) { \ __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \ } \ } while (0) #else #define BRT_DEBUG(...) 
do { } while (0) #endif int brt_zap_leaf_blockshift = 12; int brt_zap_indirect_blockshift = 12; static kstat_t *brt_ksp; typedef struct brt_stats { kstat_named_t brt_addref_entry_in_memory; kstat_named_t brt_addref_entry_not_on_disk; kstat_named_t brt_addref_entry_on_disk; kstat_named_t brt_addref_entry_read_lost_race; kstat_named_t brt_decref_entry_in_memory; kstat_named_t brt_decref_entry_loaded_from_disk; kstat_named_t brt_decref_entry_not_in_memory; kstat_named_t brt_decref_entry_not_on_disk; kstat_named_t brt_decref_entry_read_lost_race; kstat_named_t brt_decref_entry_still_referenced; kstat_named_t brt_decref_free_data_later; kstat_named_t brt_decref_free_data_now; kstat_named_t brt_decref_no_entry; } brt_stats_t; static brt_stats_t brt_stats = { { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, { "decref_free_data_later", KSTAT_DATA_UINT64 }, { "decref_free_data_now", KSTAT_DATA_UINT64 }, { "decref_no_entry", KSTAT_DATA_UINT64 } }; struct { wmsum_t brt_addref_entry_in_memory; wmsum_t brt_addref_entry_not_on_disk; wmsum_t brt_addref_entry_on_disk; wmsum_t brt_addref_entry_read_lost_race; wmsum_t brt_decref_entry_in_memory; wmsum_t brt_decref_entry_loaded_from_disk; wmsum_t brt_decref_entry_not_in_memory; wmsum_t brt_decref_entry_not_on_disk; wmsum_t brt_decref_entry_read_lost_race; wmsum_t brt_decref_entry_still_referenced; wmsum_t brt_decref_free_data_later; wmsum_t brt_decref_free_data_now; wmsum_t brt_decref_no_entry; } brt_sums; #define BRTSTAT_BUMP(stat) wmsum_add(&brt_sums.stat, 1) static int brt_entry_compare(const void *x1, const void *x2); static int brt_pending_entry_compare(const void *x1, const void *x2); static void brt_rlock(brt_t *brt) { rw_enter(&brt->brt_lock, RW_READER); } static void brt_wlock(brt_t *brt) { rw_enter(&brt->brt_lock, RW_WRITER); } static void brt_unlock(brt_t *brt) { rw_exit(&brt->brt_lock); } static uint16_t brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) { ASSERT3U(idx, <, brtvd->bv_size); if (brtvd->bv_need_byteswap) { return (BSWAP_16(brtvd->bv_entcount[idx])); } else { return (brtvd->bv_entcount[idx]); } } static void brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) { ASSERT3U(idx, <, brtvd->bv_size); if (brtvd->bv_need_byteswap) { brtvd->bv_entcount[idx] = BSWAP_16(entcnt); } else { brtvd->bv_entcount[idx] = entcnt; } } static void brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) { uint16_t entcnt; ASSERT3U(idx, <, brtvd->bv_size); entcnt = brt_vdev_entcount_get(brtvd, idx); ASSERT(entcnt < UINT16_MAX); brt_vdev_entcount_set(brtvd, idx, entcnt + 1); } static void brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) { uint16_t entcnt; ASSERT3U(idx, <, brtvd->bv_size); entcnt = brt_vdev_entcount_get(brtvd, idx); ASSERT(entcnt > 0); brt_vdev_entcount_set(brtvd, idx, entcnt - 1); } #ifdef ZFS_DEBUG static void brt_vdev_dump(brt_t *brt) { brt_vdev_t *brtvd; uint64_t vdevid; if ((zfs_flags & ZFS_DEBUG_BRT) == 0) { return; } if (brt->brt_nvdevs == 0) { zfs_dbgmsg("BRT empty"); return; } zfs_dbgmsg("BRT vdev dump:"); for (vdevid = 0; vdevid < 
brt->brt_nvdevs; vdevid++) { uint64_t idx; brtvd = &brt->brt_vdevs[vdevid]; zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d " "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid, brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, (u_longlong_t)brtvd->bv_size, (u_longlong_t)brtvd->bv_totalcount, (u_longlong_t)brtvd->bv_nblocks, (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); if (brtvd->bv_totalcount > 0) { zfs_dbgmsg(" entcounts:"); for (idx = 0; idx < brtvd->bv_size; idx++) { if (brt_vdev_entcount_get(brtvd, idx) > 0) { zfs_dbgmsg(" [%04llu] %hu", (u_longlong_t)idx, brt_vdev_entcount_get(brtvd, idx)); } } } if (brtvd->bv_entcount_dirty) { char *bitmap; bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); for (idx = 0; idx < brtvd->bv_nblocks; idx++) { bitmap[idx] = BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } bitmap[idx] = '\0'; zfs_dbgmsg(" bitmap: %s", bitmap); kmem_free(bitmap, brtvd->bv_nblocks + 1); } } } #endif static brt_vdev_t * brt_vdev(brt_t *brt, uint64_t vdevid) { brt_vdev_t *brtvd; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); if (vdevid < brt->brt_nvdevs) { brtvd = &brt->brt_vdevs[vdevid]; } else { brtvd = NULL; } return (brtvd); } static void brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT0(brtvd->bv_mos_brtvdev); ASSERT0(brtvd->bv_mos_entries); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_bitmap != NULL); ASSERT(brtvd->bv_nblocks > 0); brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, 0, tx); VERIFY(brtvd->bv_mos_entries != 0); BRT_DEBUG("MOS entries created, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); /* * We allocate DMU buffer to store the bv_entcount[] array. * We will keep array size (bv_size) and cummulative count for all * bv_entcount[]s (bv_totalcount) in the bonus buffer. 
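 * For reference, the bonus buffer (brt_vdev_phys_t) carries roughly the
 * following fields; see brt_vdev_sync() and brt_vdev_load() for the
 * authoritative set:
 *
 *	bvp_mos_entries		object number of the per-vdev entries ZAP
 *	bvp_size		number of bv_entcount[] entries written
 *	bvp_byteorder		byte order the array was written in
 *	bvp_totalcount		sum of all bv_entcount[] entries
 *	bvp_rangesize		region size the counters are keyed on
 *	bvp_usedspace		space referenced by cloned blocks
 *	bvp_savedspace		space saved thanks to additional references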
*/ brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); VERIFY(brtvd->bv_mos_brtvdev != 0); BRT_DEBUG("MOS BRT VDEV created, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("Pool directory object created, object=%s", name); spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) { vdev_t *vd; uint16_t *entcount; ulong_t *bitmap; uint64_t nblocks, size; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); if (!brtvd->bv_initiated) { ASSERT0(brtvd->bv_size); ASSERT(brtvd->bv_entcount == NULL); ASSERT(brtvd->bv_bitmap == NULL); ASSERT0(brtvd->bv_nblocks); avl_create(&brtvd->bv_tree, brt_entry_compare, sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); } else { ASSERT(brtvd->bv_size > 0); ASSERT(brtvd->bv_entcount != NULL); ASSERT(brtvd->bv_bitmap != NULL); ASSERT(brtvd->bv_nblocks > 0); /* * TODO: Allow vdev shrinking. We only need to implement * shrinking the on-disk BRT VDEV object. * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, * size, tx); */ ASSERT3U(brtvd->bv_size, <=, size); memcpy(entcount, brtvd->bv_entcount, sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), BT_SIZEOFMAP(brtvd->bv_nblocks))); vmem_free(brtvd->bv_entcount, sizeof (entcount[0]) * brtvd->bv_size); kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); } brtvd->bv_size = size; brtvd->bv_entcount = entcount; brtvd->bv_bitmap = bitmap; brtvd->bv_nblocks = nblocks; if (!brtvd->bv_initiated) { brtvd->bv_need_byteswap = FALSE; brtvd->bv_initiated = TRUE; BRT_DEBUG("BRT VDEV %llu initiated.", (u_longlong_t)brtvd->bv_vdevid); } } static void brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) { char name[64]; dmu_buf_t *db; brt_vdev_phys_t *bvphys; int error; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); if (error != 0) return; ASSERT(brtvd->bv_mos_brtvdev != 0); error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); ASSERT0(error); if (error != 0) return; bvphys = db->db_data; if (brt->brt_rangesize == 0) { brt->brt_rangesize = bvphys->bvp_rangesize; } else { ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); } ASSERT(!brtvd->bv_initiated); brt_vdev_realloc(brt, brtvd); /* TODO: We don't support VDEV shrinking. */ ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); /* * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. 
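 * For example (sizes are hypothetical): if the on-disk array held
 * bvp_size == 100 entries but the grown vdev now needs bv_size == 120,
 * the dmu_read() below pulls in only the first 100 entries and entries
 * 100..119 stay zero, which is correct since no BRT entries can exist
 * yet for the newly added range.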
*/ error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), brtvd->bv_entcount, DMU_READ_NO_PREFETCH); ASSERT0(error); brtvd->bv_mos_entries = bvphys->bvp_mos_entries; ASSERT(brtvd->bv_mos_entries != 0); brtvd->bv_need_byteswap = (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); brtvd->bv_totalcount = bvphys->bvp_totalcount; brtvd->bv_usedspace = bvphys->bvp_usedspace; brtvd->bv_savedspace = bvphys->bvp_savedspace; brt->brt_usedspace += brtvd->bv_usedspace; brt->brt_savedspace += brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", name, (u_longlong_t)brtvd->bv_mos_brtvdev, (u_longlong_t)brtvd->bv_mos_entries); } static void brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) { ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_initiated); vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); brtvd->bv_entcount = NULL; kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); brtvd->bv_bitmap = NULL; ASSERT0(avl_numnodes(&brtvd->bv_tree)); avl_destroy(&brtvd->bv_tree); brtvd->bv_size = 0; brtvd->bv_nblocks = 0; brtvd->bv_initiated = FALSE; BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); } static void brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) { char name[64]; uint64_t count; dmu_buf_t *db; brt_vdev_phys_t *bvphys; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(brtvd->bv_mos_entries != 0); VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); VERIFY0(count); VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); BRT_DEBUG("MOS entries destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_entries); brtvd->bv_mos_entries = 0; VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); bvphys = db->db_data; ASSERT0(bvphys->bvp_totalcount); ASSERT0(bvphys->bvp_usedspace); ASSERT0(bvphys->bvp_savedspace); dmu_buf_rele(db, FTAG); VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", (u_longlong_t)brtvd->bv_mos_brtvdev); brtvd->bv_mos_brtvdev = 0; snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, (u_longlong_t)brtvd->bv_vdevid); VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); BRT_DEBUG("Pool directory object removed, object=%s", name); brt_vdev_dealloc(brt, brtvd); spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); } static void brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) { brt_vdev_t *brtvd, *vdevs; uint64_t vdevid; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT3U(nvdevs, >, brt->brt_nvdevs); vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); if (brt->brt_nvdevs > 0) { ASSERT(brt->brt_vdevs != NULL); memcpy(vdevs, brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); } for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { brtvd = &vdevs[vdevid]; brtvd->bv_vdevid = vdevid; brtvd->bv_initiated = FALSE; } BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); brt->brt_vdevs = vdevs; brt->brt_nvdevs = nvdevs; } static boolean_t brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) { uint64_t idx; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); idx = bre->bre_offset / brt->brt_rangesize; if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { /* VDEV wasn't expanded. 
*/ return (brt_vdev_entcount_get(brtvd, idx) > 0); } return (FALSE); } static void brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); ASSERT(brtvd != NULL); ASSERT(brtvd->bv_entcount != NULL); brt->brt_savedspace += dsize; brtvd->bv_savedspace += dsize; brtvd->bv_meta_dirty = TRUE; if (bre->bre_refcount > 1) { return; } brt->brt_usedspace += dsize; brtvd->bv_usedspace += dsize; idx = bre->bre_offset / brt->brt_rangesize; if (idx >= brtvd->bv_size) { /* VDEV has been expanded. */ brt_vdev_realloc(brt, brtvd); } ASSERT3U(idx, <, brtvd->bv_size); brtvd->bv_totalcount++; brt_vdev_entcount_inc(brtvd, idx); brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG brt_vdev_dump(brt); #endif } static void brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, uint64_t dsize) { uint64_t idx; ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd != NULL); ASSERT(brtvd->bv_entcount != NULL); brt->brt_savedspace -= dsize; brtvd->bv_savedspace -= dsize; brtvd->bv_meta_dirty = TRUE; if (bre->bre_refcount > 0) { return; } brt->brt_usedspace -= dsize; brtvd->bv_usedspace -= dsize; idx = bre->bre_offset / brt->brt_rangesize; ASSERT3U(idx, <, brtvd->bv_size); ASSERT(brtvd->bv_totalcount > 0); brtvd->bv_totalcount--; brt_vdev_entcount_dec(brtvd, idx); brtvd->bv_entcount_dirty = TRUE; idx = idx / BRT_BLOCKSIZE / 8; BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG brt_vdev_dump(brt); #endif } static void brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) { dmu_buf_t *db; brt_vdev_phys_t *bvphys; ASSERT(brtvd->bv_meta_dirty); ASSERT(brtvd->bv_mos_brtvdev != 0); ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); if (brtvd->bv_entcount_dirty) { /* * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. 
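 * One possible shape for that, assuming each bv_bitmap bit covers one
 * BRT_BLOCKSIZE-sized chunk of the array (sketch only, not what is
 * implemented below):
 *
 *	for (uint64_t b = 0; b < brtvd->bv_nblocks; b++) {
 *		if (!BT_TEST(brtvd->bv_bitmap, b))
 *			continue;
 *		uint64_t off = b * BRT_BLOCKSIZE;
 *		uint64_t len = MIN(BRT_BLOCKSIZE,
 *		    brtvd->bv_size * sizeof (brtvd->bv_entcount[0]) - off);
 *		dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, off, len,
 *		    (char *)brtvd->bv_entcount + off, tx);
 *	}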
*/ dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), brtvd->bv_entcount, tx); memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); brtvd->bv_entcount_dirty = FALSE; } dmu_buf_will_dirty(db, tx); bvphys = db->db_data; bvphys->bvp_mos_entries = brtvd->bv_mos_entries; bvphys->bvp_size = brtvd->bv_size; if (brtvd->bv_need_byteswap) { bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; } else { bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; } bvphys->bvp_totalcount = brtvd->bv_totalcount; bvphys->bvp_rangesize = brt->brt_rangesize; bvphys->bvp_usedspace = brtvd->bv_usedspace; bvphys->bvp_savedspace = brtvd->bv_savedspace; dmu_buf_rele(db, FTAG); brtvd->bv_meta_dirty = FALSE; } static void brt_vdevs_alloc(brt_t *brt, boolean_t load) { brt_vdev_t *brtvd; uint64_t vdevid; brt_wlock(brt); brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); if (load) { for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brtvd = &brt->brt_vdevs[vdevid]; ASSERT(brtvd->bv_entcount == NULL); brt_vdev_load(brt, brtvd); } } if (brt->brt_rangesize == 0) { brt->brt_rangesize = BRT_RANGESIZE; } brt_unlock(brt); } static void brt_vdevs_free(brt_t *brt) { brt_vdev_t *brtvd; uint64_t vdevid; brt_wlock(brt); for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brtvd = &brt->brt_vdevs[vdevid]; if (brtvd->bv_initiated) brt_vdev_dealloc(brt, brtvd); } kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); brt_unlock(brt); } static void brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) { bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); bre->bre_refcount = 0; *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); } static int brt_entry_compare(const void *x1, const void *x2) { const brt_entry_t *bre1 = x1; const brt_entry_t *bre2 = x2; return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); } static int brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) { uint64_t mos_entries; uint64_t one, physsize; int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); if (!brt_vdev_lookup(brt, brtvd, bre)) return (SET_ERROR(ENOENT)); /* * Remember mos_entries object number. After we reacquire the BRT lock, * the brtvd pointer may be invalid. */ mos_entries = brtvd->bv_mos_entries; if (mos_entries == 0) return (SET_ERROR(ENOENT)); brt_unlock(brt); error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, BRT_KEY_WORDS, &one, &physsize); if (error == 0) { ASSERT3U(one, ==, 1); ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); error = zap_lookup_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount); BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " "count=%llu error=%d", (u_longlong_t)mos_entries, (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, error == 0 ? 
(u_longlong_t)bre->bre_refcount : 0, error); } brt_wlock(brt); return (error); } static void brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) { brt_vdev_t *brtvd; uint64_t mos_entries = 0; brt_rlock(brt); brtvd = brt_vdev(brt, vdevid); if (brtvd != NULL) mos_entries = brtvd->bv_mos_entries; brt_unlock(brt); if (mos_entries == 0) return; BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", (u_longlong_t)mos_entries, (u_longlong_t)vdevid, (u_longlong_t)bre->bre_offset); (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); } static int brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) { int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_mos_entries != 0); ASSERT(bre->bre_refcount > 0); error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, sizeof (bre->bre_refcount), &bre->bre_refcount, tx); BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " "error=%d", (u_longlong_t)brtvd->bv_mos_entries, (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, (u_longlong_t)bre->bre_refcount, error); return (error); } static int brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) { int error; ASSERT(RW_LOCK_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_mos_entries != 0); ASSERT0(bre->bre_refcount); error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " "error=%d", (u_longlong_t)brtvd->bv_mos_entries, (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, (u_longlong_t)bre->bre_refcount, error); return (error); } /* * Return TRUE if we _can_ have BRT entry for this bp. It might be false * positive, but gives us quick answer if we should look into BRT, which * may require reads and thus will be more expensive. 
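 *
 * A hypothetical caller on the free path can use it as a cheap filter
 * (illustrative sketch only):
 *
 *	if (brt_maybe_exists(spa, bp)) {
 *		(slow path: bp may have been cloned, so consult the BRT
 *		 before freeing its data)
 *	} else {
 *		(fast path: bp cannot have a BRT entry, free it directly)
 *	}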
*/ boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp) { brt_t *brt = spa->spa_brt; brt_vdev_t *brtvd; brt_entry_t bre_search; boolean_t mayexists = FALSE; uint64_t vdevid; brt_entry_fill(bp, &bre_search, &vdevid); brt_rlock(brt); brtvd = brt_vdev(brt, vdevid); if (brtvd != NULL && brtvd->bv_initiated) { if (!avl_is_empty(&brtvd->bv_tree) || brt_vdev_lookup(brt, brtvd, &bre_search)) { mayexists = TRUE; } } brt_unlock(brt); return (mayexists); } uint64_t brt_get_dspace(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt == NULL) return (0); return (brt->brt_savedspace); } uint64_t brt_get_used(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt == NULL) return (0); return (brt->brt_usedspace); } uint64_t brt_get_saved(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt == NULL) return (0); return (brt->brt_savedspace); } uint64_t brt_get_ratio(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt->brt_usedspace == 0) return (100); return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / brt->brt_usedspace); } static int brt_kstats_update(kstat_t *ksp, int rw) { brt_stats_t *bs = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); bs->brt_addref_entry_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_in_memory); bs->brt_addref_entry_not_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); bs->brt_addref_entry_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_on_disk); bs->brt_addref_entry_read_lost_race.value.ui64 = wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); bs->brt_decref_entry_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_in_memory); bs->brt_decref_entry_loaded_from_disk.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); bs->brt_decref_entry_not_in_memory.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); bs->brt_decref_entry_not_on_disk.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); bs->brt_decref_entry_read_lost_race.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); bs->brt_decref_entry_still_referenced.value.ui64 = wmsum_value(&brt_sums.brt_decref_entry_still_referenced); bs->brt_decref_free_data_later.value.ui64 = wmsum_value(&brt_sums.brt_decref_free_data_later); bs->brt_decref_free_data_now.value.ui64 = wmsum_value(&brt_sums.brt_decref_free_data_now); bs->brt_decref_no_entry.value.ui64 = wmsum_value(&brt_sums.brt_decref_no_entry); return (0); } static void brt_stat_init(void) { wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); wmsum_init(&brt_sums.brt_decref_free_data_later, 0); wmsum_init(&brt_sums.brt_decref_free_data_now, 0); wmsum_init(&brt_sums.brt_decref_no_entry, 0); brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (brt_ksp != NULL) { brt_ksp->ks_data = &brt_stats; brt_ksp->ks_update = brt_kstats_update; kstat_install(brt_ksp); } } static void brt_stat_fini(void) { if (brt_ksp != NULL) { kstat_delete(brt_ksp); brt_ksp = NULL; } 
wmsum_fini(&brt_sums.brt_addref_entry_in_memory); wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_addref_entry_on_disk); wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); wmsum_fini(&brt_sums.brt_decref_free_data_later); wmsum_fini(&brt_sums.brt_decref_free_data_now); wmsum_fini(&brt_sums.brt_decref_no_entry); } void brt_init(void) { brt_entry_cache = kmem_cache_create("brt_entry_cache", sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); brt_stat_init(); } void brt_fini(void) { brt_stat_fini(); kmem_cache_destroy(brt_entry_cache); kmem_cache_destroy(brt_pending_entry_cache); } static brt_entry_t * brt_entry_alloc(const brt_entry_t *bre_init) { brt_entry_t *bre; bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); bre->bre_offset = bre_init->bre_offset; bre->bre_refcount = bre_init->bre_refcount; return (bre); } static void brt_entry_free(brt_entry_t *bre) { kmem_cache_free(brt_entry_cache, bre); } static void brt_entry_addref(brt_t *brt, const blkptr_t *bp) { brt_vdev_t *brtvd; brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; uint64_t vdevid; int error; ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); brt_entry_fill(bp, &bre_search, &vdevid); brt_wlock(brt); brtvd = brt_vdev(brt, vdevid); if (brtvd == NULL) { ASSERT3U(vdevid, >=, brt->brt_nvdevs); /* New VDEV was added. */ brt_vdevs_expand(brt, vdevid + 1); brtvd = brt_vdev(brt, vdevid); } ASSERT(brtvd != NULL); if (!brtvd->bv_initiated) brt_vdev_realloc(brt, brtvd); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_addref_entry_in_memory); } else { /* * brt_entry_lookup() may drop the BRT (read) lock and * reacquire it (write). */ error = brt_entry_lookup(brt, brtvd, &bre_search); /* bre_search now contains correct bre_refcount */ ASSERT(error == 0 || error == ENOENT); if (error == 0) BRTSTAT_BUMP(brt_addref_entry_on_disk); else BRTSTAT_BUMP(brt_addref_entry_not_on_disk); /* * When the BRT lock was dropped, brt_vdevs[] may have been * expanded and reallocated, we need to update brtvd's pointer. */ brtvd = brt_vdev(brt, vdevid); ASSERT(brtvd != NULL); racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre == NULL) { bre = brt_entry_alloc(&bre_search); ASSERT(RW_WRITE_HELD(&brt->brt_lock)); avl_insert(&brtvd->bv_tree, bre, where); brt->brt_nentries++; } else { /* * The entry was added when the BRT lock was dropped in * brt_entry_lookup(). */ BRTSTAT_BUMP(brt_addref_entry_read_lost_race); bre = racebre; } } bre->bre_refcount++; brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); brt_unlock(brt); } /* Return TRUE if block should be freed immediately. 
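 * B_FALSE means that some clone still references the data, so only the
 * reference count is dropped.  A hypothetical caller (illustrative
 * sketch; txg is whatever transaction group the free belongs to):
 *
 *	if (brt_entry_decref(spa, bp))
 *		zio_free(spa, txg, bp);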
*/ boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp) { brt_t *brt = spa->spa_brt; brt_vdev_t *brtvd; brt_entry_t *bre, *racebre; brt_entry_t bre_search; avl_index_t where; uint64_t vdevid; int error; brt_entry_fill(bp, &bre_search, &vdevid); brt_wlock(brt); brtvd = brt_vdev(brt, vdevid); ASSERT(brtvd != NULL); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre != NULL) { BRTSTAT_BUMP(brt_decref_entry_in_memory); goto out; } else { BRTSTAT_BUMP(brt_decref_entry_not_in_memory); } /* * brt_entry_lookup() may drop the BRT lock and reacquire it. */ error = brt_entry_lookup(brt, brtvd, &bre_search); /* bre_search now contains correct bre_refcount */ ASSERT(error == 0 || error == ENOENT); /* * When the BRT lock was dropped, brt_vdevs[] may have been expanded * and reallocated, we need to update brtvd's pointer. */ brtvd = brt_vdev(brt, vdevid); ASSERT(brtvd != NULL); if (error == ENOENT) { BRTSTAT_BUMP(brt_decref_entry_not_on_disk); bre = NULL; goto out; } racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); if (racebre != NULL) { /* * The entry was added when the BRT lock was dropped in * brt_entry_lookup(). */ BRTSTAT_BUMP(brt_decref_entry_read_lost_race); bre = racebre; goto out; } BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); bre = brt_entry_alloc(&bre_search); ASSERT(RW_WRITE_HELD(&brt->brt_lock)); avl_insert(&brtvd->bv_tree, bre, where); brt->brt_nentries++; out: if (bre == NULL) { /* * This is a free of a regular (not cloned) block. */ brt_unlock(brt); BRTSTAT_BUMP(brt_decref_no_entry); return (B_TRUE); } if (bre->bre_refcount == 0) { brt_unlock(brt); BRTSTAT_BUMP(brt_decref_free_data_now); return (B_TRUE); } ASSERT(bre->bre_refcount > 0); bre->bre_refcount--; if (bre->bre_refcount == 0) BRTSTAT_BUMP(brt_decref_free_data_later); else BRTSTAT_BUMP(brt_decref_entry_still_referenced); brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); brt_unlock(brt); return (B_FALSE); } uint64_t brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) { brt_t *brt = spa->spa_brt; brt_vdev_t *brtvd; brt_entry_t bre_search, *bre; uint64_t vdevid, refcnt; int error; brt_entry_fill(bp, &bre_search, &vdevid); brt_rlock(brt); brtvd = brt_vdev(brt, vdevid); ASSERT(brtvd != NULL); bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); if (bre == NULL) { error = brt_entry_lookup(brt, brtvd, &bre_search); ASSERT(error == 0 || error == ENOENT); if (error == ENOENT) refcnt = 0; else refcnt = bre_search.bre_refcount; } else refcnt = bre->bre_refcount; brt_unlock(brt); return (refcnt); } static void brt_prefetch(brt_t *brt, const blkptr_t *bp) { brt_entry_t bre; uint64_t vdevid; ASSERT(bp != NULL); if (!zfs_brt_prefetch) return; brt_entry_fill(bp, &bre, &vdevid); brt_entry_prefetch(brt, vdevid, &bre); } static int brt_pending_entry_compare(const void *x1, const void *x2) { const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; int cmp; cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); if (cmp == 0) { cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), DVA_GET_VDEV(&bp2->blk_dva[0])); if (cmp == 0) { cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), DVA_GET_OFFSET(&bp2->blk_dva[0])); } } return (cmp); } void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { brt_t *brt; avl_tree_t *pending_tree; kmutex_t *pending_lock; brt_pending_entry_t *bpe, *newbpe; avl_index_t where; uint64_t txg; brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; 
pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); newbpe->bpe_bp = *bp; newbpe->bpe_count = 1; mutex_enter(pending_lock); bpe = avl_find(pending_tree, newbpe, &where); if (bpe == NULL) { avl_insert(pending_tree, newbpe, where); newbpe = NULL; } else { bpe->bpe_count++; } mutex_exit(pending_lock); if (newbpe != NULL) { ASSERT(bpe != NULL); ASSERT(bpe != newbpe); kmem_cache_free(brt_pending_entry_cache, newbpe); } else { ASSERT(bpe == NULL); } /* Prefetch BRT entry, as we will need it in the syncing context. */ brt_prefetch(brt, bp); } void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) { brt_t *brt; avl_tree_t *pending_tree; kmutex_t *pending_lock; brt_pending_entry_t *bpe, bpe_search; uint64_t txg; brt = spa->spa_brt; txg = dmu_tx_get_txg(tx); ASSERT3U(txg, !=, 0); pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; bpe_search.bpe_bp = *bp; mutex_enter(pending_lock); bpe = avl_find(pending_tree, &bpe_search, NULL); /* I believe we should always find bpe when this function is called. */ if (bpe != NULL) { ASSERT(bpe->bpe_count > 0); bpe->bpe_count--; if (bpe->bpe_count == 0) { avl_remove(pending_tree, bpe); kmem_cache_free(brt_pending_entry_cache, bpe); } } mutex_exit(pending_lock); } void brt_pending_apply(spa_t *spa, uint64_t txg) { brt_t *brt; brt_pending_entry_t *bpe; avl_tree_t *pending_tree; kmutex_t *pending_lock; void *c; ASSERT3U(txg, !=, 0); brt = spa->spa_brt; pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; mutex_enter(pending_lock); c = NULL; while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { boolean_t added_to_ddt; mutex_exit(pending_lock); for (int i = 0; i < bpe->bpe_count; i++) { /* * If the block has DEDUP bit set, it means that it * already exists in the DEDUP table, so we can just * use that instead of creating new entry in * the BRT table. */ if (BP_GET_DEDUP(&bpe->bpe_bp)) { added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); } else { added_to_ddt = B_FALSE; } if (!added_to_ddt) brt_entry_addref(brt, &bpe->bpe_bp); } kmem_cache_free(brt_pending_entry_cache, bpe); mutex_enter(pending_lock); } mutex_exit(pending_lock); } static void brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) { ASSERT(RW_WRITE_HELD(&brt->brt_lock)); ASSERT(brtvd->bv_mos_entries != 0); if (bre->bre_refcount == 0) { int error; error = brt_entry_remove(brt, brtvd, bre, tx); ASSERT(error == 0 || error == ENOENT); /* * If error == ENOENT then zfs_clone_range() was done from a * removed (but opened) file (open(), unlink()). 
*/ ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); } else { VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); } } static void brt_sync_table(brt_t *brt, dmu_tx_t *tx) { brt_vdev_t *brtvd; brt_entry_t *bre; uint64_t vdevid; void *c; brt_wlock(brt); for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { brtvd = &brt->brt_vdevs[vdevid]; if (!brtvd->bv_initiated) continue; if (!brtvd->bv_meta_dirty) { ASSERT(!brtvd->bv_entcount_dirty); ASSERT0(avl_numnodes(&brtvd->bv_tree)); continue; } ASSERT(!brtvd->bv_entcount_dirty || avl_numnodes(&brtvd->bv_tree) != 0); if (brtvd->bv_mos_brtvdev == 0) brt_vdev_create(brt, brtvd, tx); c = NULL; while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { brt_sync_entry(brt, brtvd, bre, tx); brt_entry_free(bre); ASSERT(brt->brt_nentries > 0); brt->brt_nentries--; } brt_vdev_sync(brt, brtvd, tx); if (brtvd->bv_totalcount == 0) brt_vdev_destroy(brt, brtvd, tx); } ASSERT0(brt->brt_nentries); brt_unlock(brt); } void brt_sync(spa_t *spa, uint64_t txg) { dmu_tx_t *tx; brt_t *brt; ASSERT(spa_syncing_txg(spa) == txg); brt = spa->spa_brt; brt_rlock(brt); if (brt->brt_nentries == 0) { /* No changes. */ brt_unlock(brt); return; } brt_unlock(brt); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); brt_sync_table(brt, tx); dmu_tx_commit(tx); } static void brt_table_alloc(brt_t *brt) { for (int i = 0; i < TXG_SIZE; i++) { avl_create(&brt->brt_pending_tree[i], brt_pending_entry_compare, sizeof (brt_pending_entry_t), offsetof(brt_pending_entry_t, bpe_node)); mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, NULL); } } static void brt_table_free(brt_t *brt) { for (int i = 0; i < TXG_SIZE; i++) { ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); avl_destroy(&brt->brt_pending_tree[i]); mutex_destroy(&brt->brt_pending_lock[i]); } } static void brt_alloc(spa_t *spa) { brt_t *brt; ASSERT(spa->spa_brt == NULL); brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); brt->brt_spa = spa; brt->brt_rangesize = 0; brt->brt_nentries = 0; brt->brt_vdevs = NULL; brt->brt_nvdevs = 0; brt_table_alloc(brt); spa->spa_brt = brt; } void brt_create(spa_t *spa) { brt_alloc(spa); brt_vdevs_alloc(spa->spa_brt, B_FALSE); } int brt_load(spa_t *spa) { brt_alloc(spa); brt_vdevs_alloc(spa->spa_brt, B_TRUE); return (0); } void brt_unload(spa_t *spa) { brt_t *brt = spa->spa_brt; if (brt == NULL) return; brt_vdevs_free(brt); brt_table_free(brt); rw_destroy(&brt->brt_lock); kmem_free(brt, sizeof (*brt)); spa->spa_brt = NULL; } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, "Enable prefetching of BRT entries"); #ifdef ZFS_BRT_DEBUG ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); #endif /* END CSTYLED */ diff --git a/sys/contrib/openzfs/module/zfs/dmu.c b/sys/contrib/openzfs/module/zfs/dmu.c index a63aac51f225..e0cdd9e3f33e 100644 --- a/sys/contrib/openzfs/module/zfs/dmu.c +++ b/sys/contrib/openzfs/module/zfs/dmu.c @@ -1,2595 +1,2593 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif /* * Enable/disable nopwrite feature. */ static int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied L1 blocks from frees allowed into * one TXG. After this threshold is crossed, additional dirty blocks from frees * will wait until the next TXG. * A value of zero will disable this throttle. */ static uint_t zfs_per_txg_dirty_frees_percent = 30; /* * Enable/disable forcing txg sync when dirty checking for holes with lseek(). * By default this is enabled to ensure accurate hole reporting, it can result * in a significant performance penalty for lseek(SEEK_HOLE) heavy workloads. * Disabling this option will result in holes never being reported in dirty * files which is always safe. */ static int zfs_dmu_offset_next_sync = 1; /* * Limit the amount we can prefetch with one call to this amount. This * helps to limit the amount of memory that can be used by prefetching. * Larger objects should be prefetched a bit at a time. 
*/ #ifdef _ILP32 uint_t dmu_prefetch_max = 8 * 1024 * 1024; #else uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; #endif const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { 
dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, const void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * Lookup and hold the bonus buffer for the provided dnode. If the dnode * has not yet been allocated a new bonus dbuf a will be allocated. 
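 * A typical consumer holds the bonus buffer, inspects or updates its
 * contents, and releases it again, e.g. (illustrative sketch):
 *
 *	dmu_buf_t *db;
 *	if (dmu_bonus_hold(os, object, FTAG, &db) == 0) {
 *		(use db->db_data / db->db_size here)
 *		dmu_buf_rele(db, FTAG);
 *	}
 *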
* Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp, uint32_t flags) { dmu_buf_impl_t *db; int error; uint32_t db_flags = DB_RF_MUST_SUCCEED; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (zfs_refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); error = dbuf_read(db, NULL, db_flags); if (error) { dnode_evict_bonus(dn); dbuf_rele(db, tag); *dbp = NULL; return (error); } *dbp = &db->db; return (0); } int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. */ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else { dbuf_rele(db, tag); *dbp = NULL; } return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, const void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; uint32_t db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_DECRYPT) db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. 
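 *
 * For example (illustrative), compare the two flavours of plain buffer
 * hold above; the per-dnode one skips the per-call object lookup:
 *
 *	dmu_buf_hold(os, object, offset, FTAG, &db, flags);
 *	dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, flags);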
*/ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; zstream_t *zs = NULL; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio = NULL; boolean_t missed = B_FALSE; ASSERT(!read || length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. */ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; if ((flags & DMU_READ_NO_DECRYPT) != 0) dbuf_flags |= DB_RF_NO_DECRYPT; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); if (read) zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); if ((flags & DMU_READ_NO_PREFETCH) == 0) { /* * Prepare the zfetch before initiating the demand reads, so * that if multiple threads block on same indirect block, we * base predictions on the original less racy request order. */ zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks, read, B_TRUE); } for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { if (zs) dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); if (read) zio_nowait(zio); return (SET_ERROR(EIO)); } /* * Initiate async demand data read. * We check the db_state after calling dbuf_read() because * (1) dbuf_read() may change the state to CACHED due to a * hit in the ARC, and (2) on a cache miss, a child will * have been added to "zio" but not yet completed, so the * state will not yet be CACHED. 
*/ if (read) { if (i == nblks - 1 && blkid + i < dn->dn_maxblkid && offset + length < db->db.db_offset + db->db.db_size) { if (offset <= db->db.db_offset) dbuf_flags |= DB_RF_PARTIAL_FIRST; else dbuf_flags |= DB_RF_PARTIAL_MORE; } (void) dbuf_read(db, zio, dbuf_flags); if (db->db_state != DB_CACHED) missed = B_TRUE; } dbp[i] = &db->db; } if (!read) zfs_racct_write(length, nblks); if (zs) dmu_zfetch_run(zs, missed, B_TRUE); rw_exit(&dn->dn_struct_rwlock); if (read) { /* wait for async read i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch I/Os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. If the range * it too long, prefetch the first dmu_prefetch_max bytes as requested, while * for the rest only a higher level, also fitting within dmu_prefetch_max. It * should primarily help random reads, since for long sequential reads there is * a speculative prefetcher. * * Note that if the indirect blocks above the blocks being prefetched are not * in cache, they will be asynchronously read in. Dnode read by dnode_hold() * is currently synchronous. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; int64_t level2 = level; uint64_t start, end, start2, end2; if (dmu_prefetch_max == 0 || len == 0) { dmu_prefetch_dnode(os, object, pri); return; } if (dnode_hold(os, object, FTAG, &dn) != 0) return; /* * Depending on len we may do two prefetches: blocks [start, end) at * level, and following blocks [start2, end2) at higher level2. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift != 0) { /* * The object has multiple blocks. Calculate the full range * of blocks [start, end2) and then split it into two parts, * so that the first [start, end) fits into dmu_prefetch_max. 
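 *
 * Worked example with hypothetical numbers: with 128K data and
 * indirect blocks and a dmu_prefetch_max of 8M, at most 64 level-0
 * blocks are prefetched directly as [start, end); any remainder of
 * the requested range is instead covered by level2 (level 1 or
 * higher) indirect-block prefetches over [start2, end2).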
*/ start = dbuf_whichblock(dn, level, offset); end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1; uint8_t ibs = dn->dn_indblkshift; uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs; uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs; start2 = end = MIN(end2, start + limit); /* * Find level2 where [start2, end2) fits into dmu_prefetch_max. */ uint8_t ibps = ibs - SPA_BLKPTRSHIFT; limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs; do { level2++; start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps; end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps; } while (end2 - start2 > limit); } else { /* There is only one block. Prefetch it or nothing. */ start = start2 = end2 = 0; end = start + (level == 0 && offset < dn->dn_datablksz); } for (uint64_t i = start; i < end; i++) dbuf_prefetch(dn, level, i, pri, 0); for (uint64_t i = start2; i < end2; i++) dbuf_prefetch(dn, level2, i, pri, 0); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Issue prefetch I/Os for the given object's dnode. */ void dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri) { if (object == 0 || object >= DN_MAX_OBJECT) return; dnode_t *dn = DMU_META_DNODE(os); rw_enter(&dn->dn_struct_rwlock, RW_READER); uint64_t blkid = dbuf_whichblock(dn, 0, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, 0, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed and l1blks is set to the number of level 1 * indirect blocks found within the chunk. */ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) { uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = (uint64_t)dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); /* * Check if we can free the entire range assuming that all of the * L1 blocks in this range have data. If we can, we use this * worst case value as an estimate so we can avoid having to look * at the object's actual data. */ uint64_t total_l1blks = (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / iblkrange; if (total_l1blks <= maxblks) { *l1blks = total_l1blks; *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { *l1blks = blks; return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; *l1blks = blks; return (0); } /* * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, * otherwise return false. 
* Used below in dmu_free_long_range_impl() to enable abort when unmounting */ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #else (void) os; #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (dn == NULL) return (SET_ERROR(EINVAL)); object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 20; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t l1blks; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } uint64_t txg = dmu_tx_get_txg(tx); mutex_enter(&dp->dp_lock); uint64_t long_free_dirty = dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees, wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees. */ if (dirty_frees_threshold != 0 && long_free_dirty >= dirty_frees_threshold) { DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); dmu_tx_commit(tx); txg_wait_open(dp, 0, B_TRUE); continue; } /* * In order to prevent unnecessary write throttling, for each * TXG, we track the cumulative size of L1 blocks being dirtied * in dnode_free_range() below. We compare this number to a * tunable threshold, past which we prevent new L1 dirty freeing * blocks from being added into the open TXG. See * dmu_free_long_range_impl() for details. The threshold * prevents write throttle activation due to dirty freeing L1 * blocks taking up a large percentage of zfs_dirty_data_max. */ mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += l1blks << dn->dn_indblkshift; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty, uint64_t, chunk_len, uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. 
*/ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { uint64_t newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); memset((char *)buf + newsz, 0, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } /* * Note: Lustre is an external consumer of this interface. 
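 *
 * Illustrative call sequence for such a consumer (hypothetical
 * variables; dmu_tx_hold_write_by_dnode() is assumed to be the
 * matching tx-hold helper):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write_by_dnode(tx, dn, offset, size);
 *	if (dmu_tx_assign(tx, TXG_WAIT) == 0) {
 *		dmu_write_by_dnode(dn, offset, size, buf, tx);
 *		dmu_tx_commit(tx);
 *	} else {
 *		dmu_tx_abort(tx);
 *	}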
*/ void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { int numbufs, i; dmu_buf_t **dbp; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) dmu_buf_redact(dbp[i], tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } #ifdef _KERNEL int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset zfs_uio_offset(uio). 
*/ int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { uint64_t tocpy; int64_t bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = zfs_uio_offset(uio) - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX zfs_uiomove could block forever (eg.nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that zfs_uiomove won't * block. */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset zfs_uio_offset(uio). * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset zfs_uio_offset(uio). */ int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * A "lightweight" write is faster than a regular write (e.g. * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()), because it avoids the * CPU cost of creating a dmu_buf_impl_t and arc_buf_[hdr_]_t. However, the * data can not be read or overwritten until the transaction's txg has been * synced. This makes it appropriate for workloads that are known to be * (temporarily) write-only, like "zfs receive". * * A single block is written, starting at the specified offset in bytes. If * the call is successful, it returns 0 and the provided abd has been * consumed (the caller should not free it). 
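 *
 * Illustrative call (hypothetical caller; if the call fails the abd is
 * still owned by the caller, which is assumed to free it):
 *
 *	error = dmu_lightweight_write_by_dnode(dn, offset, abd, &zp, 0, tx);
 *	if (error != 0)
 *		abd_free(abd);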
*/ int dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd, const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx) { dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, dbuf_whichblock(dn, 0, offset), tx); if (dr == NULL) return (SET_ERROR(EIO)); dr->dt.dll.dr_abd = abd; dr->dt.dll.dr_props = *zp; dr->dt.dll.dr_flags = flags; return (0); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; objset_t *os = dn->dn_objset; uint64_t object = dn->dn_object; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); if (db == NULL) return (SET_ERROR(EIO)); rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the * same size as the dbuf. */ if (offset == db->db.db_offset && blksz == db->db.db_size) { zfs_racct_write(blksz, 1); dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); } return (0); } int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { int err; dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(dbuf); err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); DB_DNODE_EXIT(dbuf); return (err); } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. */ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); BP_SET_FILL(bp, 1); } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { (void) buf; dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; zgd_t *zgd = dsa->dsa_zgd; /* * Record the vdev(s) backing this blkptr so they can be flushed after * the writes for the lwb have completed. 
*/ if (zio->io_error == 0) { zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); } mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); VERIFY(BP_EQUAL(bp, db->db_blkptr)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); VERIFY(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; zgd_t *zgd = dsa->dsa_zgd; if (zio->io_error == 0) { /* * Record the vdev(s) backing this blkptr so they can be * flushed after the writes for the lwb have completed. */ zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); if (!BP_IS_HOLE(bp)) { blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); abd_free(zio->io_abd); kmem_free(dsa, sizeof (*dsa)); } static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; int error; error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL, DB_RF_CANFAIL | DB_RF_NOPREFETCH); if (error != 0) return (error); tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); /* * This transaction does not produce any dirty data or log blocks, so * it should not be throttled. All other cases wait for TXG sync, by * which time the log block we are writing will be obsolete, so we can * skip waiting and just return error here instead. */ if (dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_waited_synced() */ return (SET_ERROR(EIO)); } /* * In order to prevent the zgd's lwb from being free'd prior to * dmu_sync_late_arrival_done() being called, we have to ensure * the lwb's "max txg" takes this tx's txg into account. */ zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; /* * Since we are currently syncing this txg, it's nontrivial to * determine what BP to nopwrite against, so we disable nopwrite. 
* * When syncing, the db_blkptr is initially the BP of the previous * txg. We can not nopwrite against it because it will be changed * (this is similar to the non-late-arrival case where the dbuf is * dirty in a future txg). * * Then dbuf_write_ready() sets bp_blkptr to the location we will write. * We can not nopwrite against it because although the BP will not * (typically) be changed, the data has not yet been persisted to this * location. * * Finally, when dbuf_write_done() is called, it is theoretically * possible to always nopwrite, because the data that was written in * this txg is the same data that we are trying to write. However we * would need to check that this dbuf is not dirty in any future * txg's (as we do in the normal dmu_sync() path). For simplicity, we * don't nopwrite in this case. */ zp->zp_nopwrite = B_FALSE; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. * The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr, *dr_next; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = dbuf_find_dirty_eq(db, txg); if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. 
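The return-value contract documented above is what a ZIL get_data callback has to honor. The fragment below is a minimal sketch of that dispatch, not the real zfs_get_data(); example_issue_sync() is a hypothetical wrapper, and the "track its progress" handling for EALREADY is left to the caller.

static int
example_issue_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
	int error = dmu_sync(pio, txg, done, zgd);

	switch (error) {
	case 0:
		/* I/O issued; the blkptr is logged from the done callback. */
		return (0);
	case EEXIST:
	case ENOENT:
		/* Already synced or freed: nothing to log, not a failure. */
		return (0);
	case EALREADY:
		/* A sync write for this buffer is already in flight. */
		return (error);
	default:
		/* EIO or similar: caller should txg_wait_synced() and retry. */
		return (error);
	}
}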
* There's no need to log writes to freed blocks, so we're done. */ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } dr_next = list_next(&db->db_dirty_records, dr); ASSERT(dr_next == NULL || dr_next->dr_txg < txg); if (db->db_blkptr != NULL) { /* * We need to fill in zgd_bp with the current blkptr so that * the nopwrite code can check if we're writing the same * data that's already on disk. We can only nopwrite if we * are sure that after making the copy, db_blkptr will not * change until our i/o completes. We ensure this by * holding the db_mtx, and only allowing nopwrite if the * block is not already dirty (see below). This is verified * by dmu_sync_done(), which VERIFYs that the db_blkptr has * not changed. */ *zgd->zgd_bp = *db->db_blkptr; } /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. 
*/ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_nlevels(dn, nlevels, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (0); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. */ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ static const int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; uint8_t complevel = os->os_complevel; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; boolean_t encrypt = B_FALSE; int copies = os->os_copies; /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. 
*/ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. */ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; switch (os->os_redundant_metadata) { case ZFS_REDUNDANT_METADATA_ALL: copies++; break; case ZFS_REDUNDANT_METADATA_MOST: if (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)) copies++; break; case ZFS_REDUNDANT_METADATA_SOME: if (DMU_OT_IS_CRITICAL(type)) copies++; break; case ZFS_REDUNDANT_METADATA_NONE: break; } } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_OFF; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); complevel = zio_complevel_select(os->os_spa, compress, complevel, complevel); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } /* * All objects in an encrypted objset are protected from modification * via a MAC. Encrypted objects store their IV and salt in the last DVA * in the bp, so we cannot use all copies. Encrypted objects are also * not subject to nopwrite since writing the same data will still * result in a new ciphertext. Only encrypted blocks can be dedup'd * to avoid ambiguity in the dedup code since the DDT does not store * object types. */ if (os->os_encrypted && (wp & WP_NOFILL) == 0) { encrypt = B_TRUE; if (DMU_OT_IS_ENCRYPTED(type)) { copies = MIN(copies, SPA_DVAS_PER_BP - 1); nopwrite = B_FALSE; } else { dedup = B_FALSE; } if (level <= 0 && (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { compress = ZIO_COMPRESS_EMPTY; } } zp->zp_compress = compress; zp->zp_complevel = complevel; zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? 
dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; zp->zp_encrypt = encrypt; zp->zp_byteorder = ZFS_HOST_BYTEORDER; memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN); memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN); memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN); zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); } /* * Reports the location of data and holes in an object. In order to * accurately report holes all dirty data must be synced to disk. This * causes extremely poor performance when seeking for holes in a dirty file. * As a compromise, only provide hole data when the dnode is clean. When * a dnode is dirty report the dnode as having no holes by returning EBUSY * which is always safe to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int restarted = 0, err; restart: err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dnode_is_dirty(dn)) { /* * If the zfs_dmu_offset_next_sync module option is enabled * then hole reporting has been requested. Dirty dnodes * must be synced to disk to accurately report holes. * * Provided a RL_READER rangelock spanning 0-UINT64_MAX is * held by the caller only a single restart will be required. * We tolerate callers which do not hold the rangelock by * returning EBUSY and not reporting holes after one restart. */ if (zfs_dmu_offset_next_sync) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (restarted) return (SET_ERROR(EBUSY)); txg_wait_synced(dmu_objset_pool(os), 0); restarted = 1; goto restart; } err = SET_ERROR(EBUSY); } else { err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK | (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (err); } int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, blkptr_t *bps, size_t *nbpsp) { dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; blkptr_t *bp; int error, numbufs; error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, &numbufs, &dbp); if (error != 0) { if (error == ESRCH) { error = SET_ERROR(ENXIO); } return (error); } ASSERT3U(numbufs, <=, *nbpsp); for (int i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; mutex_enter(&db->db_mtx); if (!list_is_empty(&db->db_dirty_records)) { dbuf_dirty_record_t *dr; dr = list_head(&db->db_dirty_records); if (dr->dt.dl.dr_brtwrite) { /* * This is very special case where we clone a * block and in the same transaction group we * read its BP (most likely to clone the clone). */ bp = &dr->dt.dl.dr_overridden_by; } else { /* * The block was modified in the same * transaction group. */ mutex_exit(&db->db_mtx); error = SET_ERROR(EAGAIN); goto out; } } else { bp = db->db_blkptr; } mutex_exit(&db->db_mtx); if (bp == NULL) { /* * The block was created in this transaction group, * so it has no BP yet. */ error = SET_ERROR(EAGAIN); goto out; } /* * Make sure we clone only data blocks. 
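As a usage note, consumers do not fill a zio_prop_t by hand; they call dmu_write_policy() and then inspect or adjust the result, as dmu_sync() does above with WP_DMU_SYNC. A minimal sketch (the function name is illustrative, and the caller is assumed to hold the dnode):

static boolean_t
example_may_nopwrite(objset_t *os, dnode_t *dn, int level)
{
	zio_prop_t zp;

	/* WP_DMU_SYNC selects the dedup checksum but disables dedup itself. */
	dmu_write_policy(os, dn, level, WP_DMU_SYNC, &zp);
	return (zp.zp_nopwrite);
}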
*/ if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) { error = SET_ERROR(EINVAL); goto out; } bps[i] = *bp; } *nbpsp = numbufs; out: dmu_buf_rele_array(dbp, numbufs, FTAG); return (error); } int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, - dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) + dmu_tx_t *tx, const blkptr_t *bps, size_t nbps) { spa_t *spa; dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; struct dirty_leaf *dl; dbuf_dirty_record_t *dr; const blkptr_t *bp; int error = 0, i, numbufs; spa = os->os_spa; VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG, &numbufs, &dbp)); ASSERT3U(nbps, ==, numbufs); /* * Before we start cloning make sure that the dbufs sizes match new BPs * sizes. If they don't, that's a no-go, as we are not able to shrink * dbufs. */ for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { error = SET_ERROR(EXDEV); goto out; } } for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); dmu_buf_will_clone(dbuf, tx); mutex_enter(&db->db_mtx); dr = list_head(&db->db_dirty_records); VERIFY(dr != NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_brtwrite = B_TRUE; dl->dr_override_state = DR_OVERRIDDEN; if (BP_IS_HOLE(bp)) { dl->dr_overridden_by.blk_birth = 0; dl->dr_overridden_by.blk_phys_birth = 0; } else { dl->dr_overridden_by.blk_birth = dr->dr_txg; if (!BP_IS_EMBEDDED(bp)) { dl->dr_overridden_by.blk_phys_birth = BP_PHYSICAL_BIRTH(bp); } } mutex_exit(&db->db_mtx); /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. - * Also, when replaying ZIL we don't want to bump references - * in the BRT as it was already done during ZIL claim. */ - if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } out: dmu_buf_rele_array(dbp, numbufs, FTAG); return (error); } void __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); __dmu_object_info_from_dnode(dn, doi); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. 
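Taken together, dmu_read_l0_bps() and dmu_brt_clone() form the two halves of a clone: read the source block pointers, then graft them into the destination as override BPs. A minimal sketch of that flow, assuming matching block sizes, a transaction already assigned against the destination range, and an illustrative 16-entry limit:

static int
example_clone_blocks(objset_t *os, uint64_t srcobj, uint64_t dstobj,
    uint64_t off, uint64_t len, dmu_tx_t *tx)
{
	blkptr_t bps[16];
	size_t nbps = 16;
	int error;

	error = dmu_read_l0_bps(os, srcobj, off, len, bps, &nbps);
	if (error != 0) {
		/* EAGAIN: the source block is dirty in this txg; retry later. */
		return (error);
	}
	return (dmu_brt_clone(os, dstobj, off, len, tx, bps, nbps));
}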
*/ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. */ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add in number of slots used for the dnode itself */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + dn->dn_num_slots; DB_DNODE_EXIT(db); } void dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *dnsize = dn->dn_num_slots << DNODE_SHIFT; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } void byteswap_uint8_array(void *vbuf, size_t size) { (void) vbuf, (void) size; } void dmu_init(void) { abd_init(); zfs_dbgmsg_init(); sa_cache_init(); dmu_objset_init(); dnode_init(); zfetch_init(); dmu_tx_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); dmu_tx_fini(); zfetch_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); abd_fini(); } EXPORT_SYMBOL(dmu_bonus_hold); EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_read_by_dnode); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_write_by_dnode); EXPORT_SYMBOL(dmu_prealloc); EXPORT_SYMBOL(dmu_object_info); EXPORT_SYMBOL(dmu_object_info_from_dnode); EXPORT_SYMBOL(dmu_object_info_from_db); EXPORT_SYMBOL(dmu_object_size_from_db); EXPORT_SYMBOL(dmu_object_dnsize_from_db); EXPORT_SYMBOL(dmu_object_set_nlevels); EXPORT_SYMBOL(dmu_object_set_blocksize); EXPORT_SYMBOL(dmu_object_set_maxblkid); EXPORT_SYMBOL(dmu_object_set_checksum); EXPORT_SYMBOL(dmu_object_set_compress); EXPORT_SYMBOL(dmu_offset_next); EXPORT_SYMBOL(dmu_write_policy); EXPORT_SYMBOL(dmu_sync); EXPORT_SYMBOL(dmu_request_arcbuf); EXPORT_SYMBOL(dmu_return_arcbuf); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); EXPORT_SYMBOL(dmu_buf_hold); EXPORT_SYMBOL(dmu_ot); ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, "Enable NOP writes"); ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, 
ZMOD_RW, "Percentage of dirtied blocks from frees in one TXG"); ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, "Enable forcing txg sync to find holes"); /* CSTYLED */ ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW, "Limit one prefetch call to this size"); diff --git a/sys/contrib/openzfs/module/zfs/zfs_replay.c b/sys/contrib/openzfs/module/zfs/zfs_replay.c index 09c7be853bf9..2e0af60f6db4 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_replay.c +++ b/sys/contrib/openzfs/module/zfs/zfs_replay.c @@ -1,1223 +1,1259 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 Cyril Plisko. All rights reserved. * Copyright (c) 2013, 2017 by Delphix. All rights reserved. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * NB: FreeBSD expects to be able to do vnode locking in lookup and * hold the locks across all subsequent VOPs until vput is called. * This means that its zfs vnops routines can't do any internal locking. * In order to have the same contract as the Linux vnops there would * needed to be duplicate locked vnops. If the vnops were used more widely * in common code this would likely be preferable. However, currently * this is the only file where this is the case. */ /* * Functions to replay ZFS intent log (ZIL) records * The functions are called through a function vector (zfs_replay_vector) * which is indexed by the transaction type. */ static void zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) { memset(vap, 0, sizeof (*vap)); vap->va_mask = (uint_t)mask; vap->va_mode = mode; #if defined(__FreeBSD__) || defined(__APPLE__) vap->va_type = IFTOVT(mode); #endif vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? 
-1 : gid; vap->va_rdev = zfs_cmpldev(rdev); vap->va_nodeid = nodeid; } static int zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) { (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } static void zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) { xoptattr_t *xoap = NULL; uint64_t *attrs; uint64_t *crtime; uint32_t *bitmap; void *scanstamp; int i; xvap->xva_vattr.va_mask |= ATTR_XVATTR; if ((xoap = xva_getxoptattr(xvap)) == NULL) { xvap->xva_vattr.va_mask &= ~ATTR_XVATTR; /* shouldn't happen */ return; } ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); bitmap = &lrattr->lr_attr_bitmap; for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) xvap->xva_reqattrmap[i] = *bitmap; attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); crtime = attrs + 1; scanstamp = (caddr_t)(crtime + 2); if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); if (XVA_ISSET_REQ(xvap, XAT_READONLY)) xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) xoap->xoa_av_quarantined = ((*attrs & XAT0_AV_QUARANTINED) != 0); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); memcpy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { /* * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid * at the same time, so we can share the same space. */ memcpy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t)); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) xoap->xoa_projinherit = ((*attrs & XAT0_PROJINHERIT) != 0); } static int zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) { uint64_t uid_idx; uint64_t gid_idx; int domcnt = 0; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); if (uid_idx) domcnt++; if (gid_idx > 0 && gid_idx != uid_idx) domcnt++; return (domcnt); } static void * zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, int domcnt) { int i; for (i = 0; i != domcnt; i++) { fuid_infop->z_domain_table[i] = start; start = (caddr_t)start + strlen(start) + 1; } return (start); } /* * Set the uid/gid in the fuid_info structure. 
*/ static void zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) { /* * If owner or group are log specific FUIDs then slurp up * domain information and build zfs_fuid_info_t */ if (IS_EPHEMERAL(uid)) fuid_infop->z_fuid_owner = uid; if (IS_EPHEMERAL(gid)) fuid_infop->z_fuid_group = gid; } /* * Load fuid domains into fuid_info_t */ static zfs_fuid_info_t * zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) { int domcnt; zfs_fuid_info_t *fuid_infop; fuid_infop = zfs_fuid_info_alloc(); domcnt = zfs_replay_domain_cnt(uid, gid); if (domcnt == 0) return (fuid_infop); fuid_infop->z_domain_table = kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP); zfs_replay_fuid_ugid(fuid_infop, uid, gid); fuid_infop->z_domain_cnt = domcnt; *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); return (fuid_infop); } /* * load zfs_fuid_t's and fuid_domains into fuid_info_t */ static zfs_fuid_info_t * zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, uint64_t gid) { uint64_t *log_fuid = (uint64_t *)start; zfs_fuid_info_t *fuid_infop; int i; fuid_infop = zfs_fuid_info_alloc(); fuid_infop->z_domain_cnt = domcnt; fuid_infop->z_domain_table = kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP); for (i = 0; i != idcnt; i++) { zfs_fuid_t *zfuid; zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); zfuid->z_logfuid = *log_fuid; zfuid->z_id = -1; zfuid->z_domidx = 0; list_insert_tail(&fuid_infop->z_fuids, zfuid); log_fuid++; } zfs_replay_fuid_ugid(fuid_infop, uid, gid); *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); return (fuid_infop); } static void zfs_replay_swap_attrs(lr_attr_t *lrattr) { /* swap the lr_attr structure */ byteswap_uint32_array(lrattr, sizeof (*lrattr)); /* swap the bitmap */ byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * sizeof (uint32_t)); /* swap the attributes, create time + 64 bit word for attributes */ byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); } /* * Replay file create with optional ACL, xvattr information as well * as option FUID information. 
*/ static int zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_create_t *lracl = arg2; char *name = NULL; /* location determined later */ lr_create_t *lr = (lr_create_t *)lracl; znode_t *dzp; znode_t *zp; xvattr_t xva; int vflg = 0; vsecattr_t vsec = { 0 }; lr_attr_t *lrattr; void *aclstart; void *fuidstart; size_t xvatlen = 0; uint64_t txtype; uint64_t objid; uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); if (txtype == TX_CREATE_ACL_ATTR || txtype == TX_MKDIR_ACL_ATTR) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); zfs_replay_swap_attrs(lrattr); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); } aclstart = (caddr_t)(lracl + 1) + xvatlen; zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); /* swap fuids */ if (lracl->lr_fuidcnt) { byteswap_uint64_array((caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), lracl->lr_fuidcnt * sizeof (uint64_t)); } } if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); objid = LR_FOID_GET_OBJ(lr->lr_foid); dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time, generation number, and dnode size. The generic * zfs_create() has no concept of these attributes, so we smuggle * the values inside the vattr's otherwise unused va_ctime, * va_nblocks, and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); if (error) goto bail; if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; switch (txtype) { case TX_CREATE_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); zfs_fallthrough; case TX_CREATE_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); xva.xva_vattr.va_mask |= ATTR_XVATTR; zfs_replay_xvattr(lrattr, &xva); } vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; vsec.vsa_aclcnt = lracl->lr_aclcnt; vsec.vsa_aclentsz = lracl->lr_acl_bytes; vsec.vsa_aclflags = lracl->lr_acl_flags; if (zfsvfs->z_fuid_replay == NULL) { fuidstart = (caddr_t)(lracl + 1) + xvatlen + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } #if defined(__linux__) error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, &vsec, zfs_init_idmap); #else error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, &vsec, NULL); #endif break; case TX_MKDIR_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); zfs_fallthrough; case TX_MKDIR_ACL_ATTR: if (name == NULL) { lrattr = (lr_attr_t *)(caddr_t)(lracl 
+ 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr(lrattr, &xva); } vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; vsec.vsa_aclcnt = lracl->lr_aclcnt; vsec.vsa_aclentsz = lracl->lr_acl_bytes; vsec.vsa_aclflags = lracl->lr_acl_flags; if (zfsvfs->z_fuid_replay == NULL) { fuidstart = (caddr_t)(lracl + 1) + xvatlen + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, lr->lr_uid, lr->lr_gid); } #if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, &vsec, zfs_init_idmap); #else error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, &vsec, NULL); #endif break; default: error = SET_ERROR(ENOTSUP); } bail: if (error == 0 && zp != NULL) { #ifdef __FreeBSD__ VOP_UNLOCK1(ZTOV(zp)); #endif zrele(zp); } zrele(dzp); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); } static int zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_create_t *lr = arg2; char *name = NULL; /* location determined later */ char *link; /* symlink content follows name */ znode_t *dzp; znode_t *zp = NULL; xvattr_t xva; int vflg = 0; size_t lrsize = sizeof (lr_create_t); lr_attr_t *lrattr; void *start; size_t xvatlen; uint64_t txtype; uint64_t objid; uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); objid = LR_FOID_GET_OBJ(lr->lr_foid); dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); /* * All forms of zfs create (create, mkdir, mkxattrdir, symlink) * eventually end up in zfs_mknode(), which assigns the object's * creation time, generation number, and dnode slot count. The * generic zfs_create() has no concept of these attributes, so * we smuggle the values inside the vattr's otherwise unused * va_ctime, va_nblocks, and va_fsid fields. */ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); xva.xva_vattr.va_nblocks = lr->lr_gen; xva.xva_vattr.va_fsid = dnodesize; error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); if (error) goto out; if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; /* * Symlinks don't have fuid info, and CIFS never creates * symlinks. * * The _ATTR versions will grab the fuid info in their subcases. 
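For orientation, the on-log layout that the TX_CREATE_ACL_ATTR parsing above walks through is roughly the following; this is reconstructed from the pointer arithmetic in this function and is illustrative, not a formal format definition.

/*
 * lr_acl_create_t              fixed header
 * lr_attr_t + bitmap + attrs   ZIL_XVAT_SIZE(lr_attr_masksize) bytes (ATTR only)
 * ACEs                         lr_acl_bytes, padded to ZIL_ACE_LENGTH()
 * log FUIDs                    lr_fuidcnt * sizeof (uint64_t)
 * FUID domain strings          NUL-terminated, lr_domcnt of them
 * name                         NUL-terminated create name
 */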
*/ if (txtype != TX_SYMLINK && txtype != TX_MKDIR_ATTR && txtype != TX_CREATE_ATTR) { start = (lr + 1); zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); } switch (txtype) { case TX_CREATE_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); start = (caddr_t)(lr + 1) + xvatlen; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; zfs_fallthrough; case TX_CREATE: if (name == NULL) name = (char *)start; #if defined(__linux__) error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, NULL, zfs_init_idmap); #else error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, NULL, NULL); #endif break; case TX_MKDIR_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); start = (caddr_t)(lr + 1) + xvatlen; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); name = (char *)start; zfs_fallthrough; case TX_MKDIR: if (name == NULL) name = (char *)(lr + 1); #if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, NULL, zfs_init_idmap); #else error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, NULL, NULL); #endif break; case TX_MKXATTR: error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred); break; case TX_SYMLINK: name = (char *)(lr + 1); link = name + strlen(name) + 1; #if defined(__linux__) error = zfs_symlink(dzp, name, &xva.xva_vattr, link, &zp, kcred, vflg, zfs_init_idmap); #else error = zfs_symlink(dzp, name, &xva.xva_vattr, link, &zp, kcred, vflg, NULL); #endif break; default: error = SET_ERROR(ENOTSUP); } out: if (error == 0 && zp != NULL) { #ifdef __FreeBSD__ VOP_UNLOCK1(ZTOV(zp)); #endif zrele(zp); } zrele(dzp); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); } static int zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_remove_t *lr = arg2; char *name = (char *)(lr + 1); /* name follows lr_remove_t */ znode_t *dzp; int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; switch ((int)lr->lr_common.lrc_txtype) { case TX_REMOVE: error = zfs_remove(dzp, name, kcred, vflg); break; case TX_RMDIR: error = zfs_rmdir(dzp, name, NULL, kcred, vflg); break; default: error = SET_ERROR(ENOTSUP); } zrele(dzp); return (error); } static int zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_link_t *lr = arg2; char *name = (char *)(lr + 1); /* name follows lr_link_t */ znode_t *dzp, *zp; int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { zrele(dzp); return (error); } if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; error = zfs_link(dzp, zp, name, kcred, vflg); zrele(zp); zrele(dzp); return (error); } static int do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname, char *tname, uint64_t rflags, vattr_t *wo_vap) { 
znode_t *sdzp, *tdzp; int error, vflg = 0; /* Only Linux currently supports RENAME_* flags. */ #ifdef __linux__ VERIFY0(rflags & ~(RENAME_EXCHANGE | RENAME_WHITEOUT)); /* wo_vap must be non-NULL iff. we're doing RENAME_WHITEOUT */ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); #else VERIFY0(rflags); #endif if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) return (error); if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { zrele(sdzp); return (error); } if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; #if defined(__linux__) error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, wo_vap, zfs_init_idmap); #else error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, wo_vap, NULL); #endif zrele(tdzp); zrele(sdzp); return (error); } static int zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); } static int zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) { #ifdef __linux__ zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, NULL)); #else return (SET_ERROR(ENOTSUP)); #endif } static int zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) { #ifdef __linux__ zfsvfs_t *zfsvfs = arg1; lr_rename_whiteout_t *lr = arg2; int error; - /* sname and tname follow lr_rename_whiteout_t */ - char *sname = (char *)(lr + 1); - char *tname = sname + strlen(sname) + 1; /* For the whiteout file. */ xvattr_t xva; uint64_t objid; uint64_t dnodesize; + ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); objid = LR_FOID_GET_OBJ(lr->lr_wfoid); dnodesize = LR_FOID_GET_SLOTS(lr->lr_wfoid) << DNODE_SHIFT; xva_init(&xva); zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, lr->lr_wmode, lr->lr_wuid, lr->lr_wgid, lr->lr_wrdev, objid); /* * As with TX_CREATE, RENAME_WHITEOUT ends up in zfs_mknode(), which * assigns the object's creation time, generation number, and dnode * slot count. The generic zfs_rename() has no concept of these * attributes, so we smuggle the values inside the vattr's otherwise * unused va_ctime, va_nblocks, and va_fsid fields. 
*/ ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_wcrtime); xva.xva_vattr.va_nblocks = lr->lr_wgen; xva.xva_vattr.va_fsid = dnodesize; error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); if (error) return (error); + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, RENAME_WHITEOUT, &xva.xva_vattr)); #else return (SET_ERROR(ENOTSUP)); #endif } static int zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_write_t *lr = arg2; char *data = (char *)(lr + 1); /* data follows lr_write_t */ znode_t *zp; int error; uint64_t eod, offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * As we can log writes out of order, it's possible the * file has been removed. In this case just drop the write * and return success. */ if (error == ENOENT) error = 0; return (error); } offset = lr->lr_offset; length = lr->lr_length; eod = offset + length; /* end of data for this write */ /* * This may be a write from a dmu_sync() for a whole block, * and may extend beyond the current end of the file. * We can't just replay what was written for this TX_WRITE as * a future TX_WRITE2 may extend the eof and the data for that * write needs to be there. So we write the whole block and * reduce the eof. This needs to be done within the single dmu * transaction created within vn_rdwr -> zfs_write. So a possible * new end of file is passed through in zfsvfs->z_replay_eof */ zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } if (zp->z_size < eod) zfsvfs->z_replay_eof = eod; } error = zfs_write_simple(zp, data, length, offset, NULL); zrele(zp); zfsvfs->z_replay_eof = 0; /* safety */ return (error); } /* * TX_WRITE2 are only generated when dmu_sync() returns EALREADY * meaning the pool block is already being synced. So now that we always write * out full blocks, all we have to do is expand the eof if * the file is grown. 
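A worked example of the whole-block rounding in zfs_replay_write() above, assuming a 128K block and an application write of 512 bytes at offset 132096 that was logged via dmu_sync(); the numbers are illustrative only.

	uint64_t blocksize = 131072;		/* BP_GET_LSIZE() of the block */
	uint64_t offset = 132096, length = 512;

	if (length < blocksize) {
		offset -= offset % blocksize;	/* 132096 - 1024 = 131072 */
		length = blocksize;		/* replay the whole 128K block */
	}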
*/ static int zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_write_t *lr = arg2; znode_t *zp; int error; uint64_t end; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); top: end = lr->lr_offset + lr->lr_length; if (end > zp->z_size) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); zp->z_size = end; dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zrele(zp); if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); return (error); } (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); /* Ensure the replayed seq is updated */ (void) zil_replaying(zfsvfs->z_log, tx); dmu_tx_commit(tx); } zrele(zp); return (error); } static int zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_truncate_t *lr = arg2; znode_t *zp; flock64_t fl = {0}; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); fl.l_type = F_WRLCK; fl.l_whence = SEEK_SET; fl.l_start = lr->lr_offset; fl.l_len = lr->lr_length; error = zfs_space(zp, F_FREESP, &fl, O_RDWR | O_LARGEFILE, lr->lr_offset, kcred); zrele(zp); return (error); } static int zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_setattr_t *lr = arg2; znode_t *zp; xvattr_t xva; vattr_t *vap = &xva.xva_vattr; int error; void *start; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + xva_init(&xva); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); if ((lr->lr_mask & ATTR_XVATTR) && zfsvfs->z_version >= ZPL_VERSION_INITIAL) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); vap->va_size = lr->lr_size; ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); gethrestime(&vap->va_ctime); vap->va_mask |= ATTR_CTIME; /* * Fill in xvattr_t portions if necessary. 
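The assign/wait/retry dance in zfs_replay_write2() above is the canonical DMU transaction pattern. It is shown in isolation below as a sketch; example_assign_retry() is an illustrative name, and the changes made under the transaction are elided.

static int
example_assign_retry(objset_t *os, znode_t *zp)
{
	dmu_tx_t *tx;
	int error;

	for (;;) {
		tx = dmu_tx_create(os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error == 0)
			break;
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			continue;
		}
		dmu_tx_abort(tx);
		return (error);
	}
	/* ... make the changes covered by the holds, e.g. update z_size ... */
	dmu_tx_commit(tx);
	return (0);
}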
*/ start = (lr_setattr_t *)(lr + 1); if (vap->va_mask & ATTR_XVATTR) { zfs_replay_xvattr((lr_attr_t *)start, &xva); start = (caddr_t)start + ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); } else xva.xva_vattr.va_mask &= ~ATTR_XVATTR; zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, lr->lr_uid, lr->lr_gid); #if defined(__linux__) error = zfs_setattr(zp, vap, 0, kcred, zfs_init_idmap); #else error = zfs_setattr(zp, vap, 0, kcred, NULL); #endif zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; zrele(zp); return (error); } static int zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_setsaxattr_t *lr = arg2; znode_t *zp; nvlist_t *nvl; size_t sa_size; char *name; char *value; size_t size; int error = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size); + ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, SPA_FEATURE_ZILSAXATTR)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); rw_enter(&zp->z_xattr_lock, RW_WRITER); mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) goto out; ASSERT(zp->z_xattr_cached); nvl = zp->z_xattr_cached; /* Get xattr name, value and size from log record */ size = lr->lr_size; name = (char *)(lr + 1); if (size == 0) { value = NULL; error = nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); } else { value = name + strlen(name) + 1; /* Limited to 32k to keep nvpair memory allocations small */ if (size > DXATTR_MAX_ENTRY_SIZE) { error = SET_ERROR(EFBIG); goto out; } /* Prevent the DXATTR SA from consuming the entire SA region */ error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error) goto out; if (sa_size > DXATTR_MAX_SA_SIZE) { error = SET_ERROR(EFBIG); goto out; } error = nvlist_add_byte_array(nvl, name, (uchar_t *)value, size); } /* * Update the SA for additions, modifications, and removals. On * error drop the inconsistent cached version of the nvlist, it * will be reconstructed from the ARC when next accessed. */ if (error == 0) error = zfs_sa_set_xattr(zp, name, value, size); if (error) { nvlist_free(nvl); zp->z_xattr_cached = NULL; } out: rw_exit(&zp->z_xattr_lock); zrele(zp); return (error); } static int zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_v0_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ vsecattr_t vsa = {0}; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + + sizeof (ace_t) * lr->lr_aclcnt); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_oldace_byteswap(ace, lr->lr_aclcnt); } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; vsa.vsa_aclflags = 0; vsa.vsa_aclentp = ace; error = zfs_setsecattr(zp, &vsa, 0, kcred); zrele(zp); return (error); } /* * Replaying ACLs is complicated by FUID support. * The log record may contain some optional data * to be used for replaying FUID's. These pieces * are the actual FUIDs that were created initially. * The FUID table index may no longer be valid and * during zfs_create() a new index may be assigned. 
* Because of this the log will contain the original * domain+rid in order to create a new FUID. * * The individual ACEs may contain an ephemeral uid/gid which is no * longer valid and will need to be replaced with an actual FUID. * */ static int zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_acl_t *lr = arg2; ace_t *ace = (ace_t *)(lr + 1); vsecattr_t vsa = {0}; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); if (lr->lr_fuidcnt) { byteswap_uint64_array((caddr_t)ace + ZIL_ACE_LENGTH(lr->lr_acl_bytes), lr->lr_fuidcnt * sizeof (uint64_t)); } } if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; vsa.vsa_aclcnt = lr->lr_aclcnt; vsa.vsa_aclentp = ace; vsa.vsa_aclentsz = lr->lr_acl_bytes; vsa.vsa_aclflags = lr->lr_acl_flags; if (lr->lr_fuidcnt) { void *fuidstart = (caddr_t)ace + ZIL_ACE_LENGTH(lr->lr_acl_bytes); zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, &fuidstart, lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); } error = zfs_setsecattr(zp, &vsa, 0, kcred); if (zfsvfs->z_fuid_replay) zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; zrele(zp); return (error); } static int zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_clone_range_t *lr = arg2; znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { /* * Clones can be logged out of order, so don't be surprised if * the file is gone - just return success. 
*/ if (error == ENOENT) error = 0; return (error); } error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length, lr->lr_blksz, lr->lr_bps, lr->lr_nbps); zrele(zp); return (error); } /* * Callback vectors for replaying records */ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_error, /* no such type */ zfs_replay_create, /* TX_CREATE */ zfs_replay_create, /* TX_MKDIR */ zfs_replay_create, /* TX_MKXATTR */ zfs_replay_create, /* TX_SYMLINK */ zfs_replay_remove, /* TX_REMOVE */ zfs_replay_remove, /* TX_RMDIR */ zfs_replay_link, /* TX_LINK */ zfs_replay_rename, /* TX_RENAME */ zfs_replay_write, /* TX_WRITE */ zfs_replay_truncate, /* TX_TRUNCATE */ zfs_replay_setattr, /* TX_SETATTR */ zfs_replay_acl_v0, /* TX_ACL_V0 */ zfs_replay_acl, /* TX_ACL */ zfs_replay_create_acl, /* TX_CREATE_ACL */ zfs_replay_create, /* TX_CREATE_ATTR */ zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL */ zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ zfs_replay_write2, /* TX_WRITE2 */ zfs_replay_setsaxattr, /* TX_SETSAXATTR */ zfs_replay_rename_exchange, /* TX_RENAME_EXCHANGE */ zfs_replay_rename_whiteout, /* TX_RENAME_WHITEOUT */ zfs_replay_clone_range, /* TX_CLONE_RANGE */ }; diff --git a/sys/contrib/openzfs/module/zfs/zfs_vnops.c b/sys/contrib/openzfs/module/zfs/zfs_vnops.c index 635d17455981..eb012fe549dc 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_vnops.c +++ b/sys/contrib/openzfs/module/zfs/zfs_vnops.c @@ -1,1499 +1,1499 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) { int error = 0; zfsvfs_t *zfsvfs = ZTOZSB(zp); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); atomic_inc_32(&zp->z_sync_writes_cnt); zil_commit(zfsvfs->z_log, zp->z_id); atomic_dec_32(&zp->z_sync_writes_cnt); zfs_exit(zfsvfs, FTAG); } return (error); } #if defined(SEEK_HOLE) && defined(SEEK_DATA) /* * Lseek support for finding holes (cmd == SEEK_HOLE) and * data (cmd == SEEK_DATA). "off" is an in/out parameter. 
*/ static int zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) { zfs_locked_range_t *lr; uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == F_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; /* Flush any mmap()'d data to disk */ if (zn_has_cached_data(zp, 0, file_sz - 1)) zn_flush_cached_data(zp, B_FALSE); lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); zfs_rangelock_exit(lr); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* File was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } int zfs_holey(znode_t *zp, ulong_t cmd, loff_t *off) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_holey_common(zp, cmd, off); zfs_exit(zfsvfs, FTAG); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ int zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (flag & V_ACE_MASK) #if defined(__linux__) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, zfs_init_idmap); #else error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, NULL); #endif else #if defined(__linux__) error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); #else error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); #endif zfs_exit(zfsvfs, FTAG); return (error); } static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: zp - inode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - O_SYNC flags; used to provide FRSYNC semantics. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * inode - atime updated if byte count > 0 */ int zfs_read(struct znode *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { (void) cr; int error = 0; boolean_t frsync = B_FALSE; zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (zp->z_pflags & ZFS_AV_QUARANTINED) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } /* We don't copy out anything useful for directories. */ if (Z_ISDIR(ZTOTYPE(zp))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } /* * Validate file offset */ if (zfs_uio_offset(uio) < (offset_t)0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (zfs_uio_resid(uio) == 0) { zfs_exit(zfsvfs, FTAG); return (0); } #ifdef FRSYNC /* * If we're in FRSYNC mode, sync out this znode before reading it. * Only do this for non-snapshots. * * Some platforms do not support FRSYNC and instead map it * to O_SYNC, which results in unnecessary calls to zil_commit. 
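/*
 * zfs_holey_common() above is what backs lseek(SEEK_HOLE/SEEK_DATA) on
 * ZFS: querying at or past EOF fails with ENXIO, and the implicit hole
 * at EOF is reported at the logical file size rather than at the end
 * of the last block.  A hedged user-space usage sketch follows (error
 * handling trimmed; _GNU_SOURCE is needed for SEEK_HOLE on Linux).
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
        if (argc != 2) {
                fprintf(stderr, "usage: %s file\n", argv[0]);
                return (1);
        }
        int fd = open(argv[1], O_RDONLY);
        if (fd == -1)
                return (1);

        off_t data = lseek(fd, 0, SEEK_DATA);
        off_t hole = lseek(fd, 0, SEEK_HOLE);

        if (data == -1 && errno == ENXIO)
                printf("no data at or after offset 0 (empty file or "
                    "trailing hole only)\n");
        else
                printf("first data at or after 0: %lld\n", (long long)data);
        if (hole != -1)
                printf("first hole at or after 0: %lld (EOF counts)\n",
                    (long long)hole);

        (void) close(fd);
        return (0);
}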
We * only honor FRSYNC requests on platforms which support it. */ frsync = !!(ioflag & FRSYNC); #endif if (zfsvfs->z_log && (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, zfs_uio_offset(uio), zfs_uio_resid(uio), RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (zfs_uio_offset(uio) >= zp->z_size) { error = 0; goto out; } ASSERT(zfs_uio_offset(uio) < zp->z_size); #if defined(__linux__) ssize_t start_offset = zfs_uio_offset(uio); #endif ssize_t n = MIN(zfs_uio_resid(uio), zp->z_size - zfs_uio_offset(uio)); ssize_t start_resid = n; while (n > 0) { ssize_t nbytes = MIN(n, zfs_vnops_read_chunk_size - P2PHASE(zfs_uio_offset(uio), zfs_vnops_read_chunk_size)); #ifdef UIO_NOCOPY if (zfs_uio_segflg(uio) == UIO_NOCOPY) error = mappedread_sf(zp, nbytes, uio); else #endif if (zn_has_cached_data(zp, zfs_uio_offset(uio), zfs_uio_offset(uio) + nbytes - 1) && !(ioflag & O_DIRECT)) { error = mappedread(zp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); #if defined(__linux__) /* * if we actually read some bytes, bubbling EFAULT * up to become EAGAIN isn't what we want here... * * ...on Linux, at least. On FBSD, doing this breaks. */ if (error == EFAULT && (zfs_uio_offset(uio) - start_offset) != 0) error = 0; #endif break; } n -= nbytes; } int64_t nread = start_resid - n; dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); task_io_account_read(nread); out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_clear_setid_bits_if_necessary(zfsvfs_t *zfsvfs, znode_t *zp, cred_t *cr, uint64_t *clear_setid_bits_txgp, dmu_tx_t *tx) { zilog_t *zilog = zfsvfs->z_log; const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); ASSERT(clear_setid_bits_txgp != NULL); ASSERT(tx != NULL); /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to do this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(zp, cr, ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); mutex_exit(&zp->z_acl_lock); /* * Make sure SUID/SGID bits will be removed when we replay the * log. If the setid bits are keep coming back, don't log more * than one TX_SETATTR per transaction group. */ if (*clear_setid_bits_txgp != dmu_tx_get_txg(tx)) { vattr_t va = {0}; va.va_mask = ATTR_MODE; va.va_nodeid = zp->z_id; va.va_mode = newmode; zfs_log_setattr(zilog, tx, TX_SETATTR, zp, &va, ATTR_MODE, NULL); *clear_setid_bits_txgp = dmu_tx_get_txg(tx); } } else { mutex_exit(&zp->z_acl_lock); } } /* * Write the bytes to a file. * * IN: zp - znode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. 
* ioflag - O_APPEND flag set if in append mode. * O_DIRECT flag; used to bypass page cache. * cr - credentials of caller. * * OUT: uio - updated offset and range. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated if byte count > 0 */ int zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) { int error = 0, error1; ssize_t start_resid = zfs_uio_resid(uio); uint64_t clear_setid_bits_txg = 0; /* * Fasttrack empty write */ ssize_t n = start_resid; if (n == 0) return (0); zfsvfs_t *zfsvfs = ZTOZSB(zp); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); sa_bulk_attr_t bulk[4]; int count = 0; uint64_t mtime[2], ctime[2]; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && (zfs_uio_offset(uio) < zp->z_size))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * Validate file offset */ offset_t woff = ioflag & O_APPEND ? zp->z_size : zfs_uio_offset(uio); if (woff < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. */ ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); if (zfs_uio_prefaultpages(pfbytes, uio)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } /* * If in append mode, set the io offset pointer to eof. */ zfs_locked_range_t *lr; if (ioflag & O_APPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } zfs_uio_setoffset(uio, woff); } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (zn_rlimit_fsize_uio(zp, uio)) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } const rlim64_t limit = MAXOFFSET_T; if (woff >= limit) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFBIG)); } if (n > limit - woff) n = limit - woff; uint64_t end_size = MAX(zp->z_size, woff + n); zilog_t *zilog = zfsvfs->z_log; boolean_t commit = (ioflag & (O_SYNC | O_DSYNC)) || (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS); const uint64_t uid = KUID_TO_SUID(ZTOUID(zp)); const uint64_t gid = KGID_TO_SGID(ZTOGID(zp)); const uint64_t projid = zp->z_projid; /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. 
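/*
 * The comment above describes the chunked write loop that follows:
 * each pass handles at most one manageable chunk in its own
 * transaction, and each chunk is trimmed so later passes stay aligned
 * (the same P2PHASE() trimming used in the read loop earlier).  Below
 * is a hedged, stand-alone sketch of that arithmetic; the 1 MiB chunk
 * size is illustrative, echoing zfs_vnops_read_chunk_size rather than
 * the block-size-based sizing the write loop actually uses.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_CHUNK      (1024 * 1024)   /* illustrative chunk size */

/*
 * Bytes to issue this pass: no more than remains, and never crossing
 * the next chunk boundary (offset % chunk == P2PHASE for powers of 2).
 */
static uint64_t
demo_chunk_bytes(uint64_t offset, uint64_t resid)
{
        uint64_t tochunk = DEMO_CHUNK - (offset % DEMO_CHUNK);

        return (resid < tochunk ? resid : tochunk);
}

int
main(void)
{
        uint64_t off = 300 * 1024;              /* unaligned start */
        uint64_t n = 3 * 1024 * 1024;           /* bytes left to write */

        while (n > 0) {
                uint64_t nbytes = demo_chunk_bytes(off, n);

                printf("chunk at %8llu for %7llu bytes\n",
                    (unsigned long long)off, (unsigned long long)nbytes);
                off += nbytes;
                n -= nbytes;
        }
        return (0);
}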
*/ while (n > 0) { woff = zfs_uio_offset(uio); if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } uint64_t blksz; if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { if (zp->z_blksz > zfsvfs->z_max_blksz && !ISP2(zp->z_blksz)) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ blksz = 1 << highbit64(zp->z_blksz); } else { blksz = zfsvfs->z_max_blksz; } blksz = MIN(blksz, P2ROUNDUP(end_size, SPA_MINBLOCKSIZE)); blksz = MAX(blksz, zp->z_blksz); } else { blksz = zp->z_blksz; } arc_buf_t *abuf = NULL; ssize_t nbytes = n; if (n >= blksz && woff >= zp->z_size && P2PHASE(woff, blksz) == 0 && (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == blksz); if ((error = zfs_uiocopy(abuf->b_data, blksz, UIO_WRITE, uio, &nbytes))) { dmu_return_arcbuf(abuf); break; } ASSERT3S(nbytes, ==, blksz); } else { nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - P2PHASE(woff, blksz)); if (pfbytes < nbytes) { if (zfs_uio_prefaultpages(nbytes, uio)) { error = SET_ERROR(EFAULT); break; } pfbytes = nbytes; } } /* * Start a transaction. */ dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * NB: We must call zfs_clear_setid_bits_if_necessary before * committing the transaction! */ /* * If rangelock_enter() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since rangelock_reduce() will * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { zfs_grow_blocksize(zp, blksz, tx); zfs_rangelock_reduce(lr, woff, n); } ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); zfs_uio_fault_disable(uio, B_TRUE); error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); zfs_uio_fault_disable(uio, B_FALSE); #ifdef __linux__ if (error == EFAULT) { zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); /* * Account for partial writes before * continuing the loop. * Update needs to occur before the next * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ n -= tx_bytes - zfs_uio_resid(uio); pfbytes -= tx_bytes - zfs_uio_resid(uio); continue; } #endif /* * On FreeBSD, EFAULT should be propagated back to the * VFS, which will handle faulting and will retry. */ if (error != 0 && error != EFAULT) { zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_tx_commit(tx); break; } tx_bytes -= zfs_uio_resid(uio); } else { /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. 
* * dmu_assign_arcbuf_by_dbuf() will directly assign the * arc buffer to a dbuf. */ error = dmu_assign_arcbuf_by_dbuf( sa_get_db(zp->z_sa_hdl), woff, abuf, tx); if (error != 0) { /* * XXX This might not be necessary if * dmu_assign_arcbuf_by_dbuf is guaranteed * to be atomic. */ zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); dmu_return_arcbuf(abuf); dmu_tx_commit(tx); break; } ASSERT3S(nbytes, <=, zfs_uio_resid(uio)); zfs_uioskip(uio, nbytes); tx_bytes = nbytes; } if (tx_bytes && zn_has_cached_data(zp, woff, woff + tx_bytes - 1) && !(ioflag & O_DIRECT)) { update_pages(zp, woff, tx_bytes, zfsvfs->z_os); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } zfs_clear_setid_bits_if_necessary(zfsvfs, zp, cr, &clear_setid_bits_txg, tx); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < zfs_uio_offset(uio)) { (void) atomic_cas_64(&zp->z_size, end_size, zfs_uio_offset(uio)); ASSERT(error == 0 || error == EFAULT); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; error1 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); if (error1 != 0) /* Avoid clobbering EFAULT. */ error = error1; /* * NB: During replay, the TX_SETATTR record logged by * zfs_clear_setid_bits_if_necessary must precede any of * the TX_WRITE records logged here. */ zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, commit, NULL, NULL); dmu_tx_commit(tx); if (error != 0) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; pfbytes -= nbytes; } zfs_znode_update_vfs(zp); zfs_rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, or the * uio data is inaccessible return an error. Otherwise, it's * at least a partial write, so it's successful. */ if (zfsvfs->z_replay || zfs_uio_resid(uio) == start_resid || error == EFAULT) { zfs_exit(zfsvfs, FTAG); return (error); } if (commit) zil_commit(zilog, zp->z_id); const int64_t nwritten = start_resid - zfs_uio_resid(uio); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); task_io_account_write(nwritten); zfs_exit(zfsvfs, FTAG); return (0); } int zfs_getsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_getacl(zp, vsecp, skipaclchk, cr); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } #ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif static void zfs_get_done(zgd_t *zgd, int error); /* * Get data to generate a TX_WRITE intent log record. 
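/*
 * zfs_get_data(), which follows, services the two flavors of TX_WRITE
 * record its body describes: immediate writes carry their data inside
 * the log record, while indirect writes sync the data through the DMU
 * and log only a block pointer.  A hedged sketch of that trade-off is
 * below; the cutoff value is purely illustrative, and the real policy
 * is decided where the itx is built (zfs_log_write()), not here.
 */
#include <stdint.h>
#include <stdio.h>

typedef enum {
        DEMO_WR_IMMEDIATE,      /* data copied into the ZIL record */
        DEMO_WR_INDIRECT        /* data synced; ZIL stores a pointer */
} demo_write_flavor_t;

#define DEMO_IMMEDIATE_MAX      (32 * 1024)     /* illustrative only */

static demo_write_flavor_t
demo_choose_flavor(uint64_t nbytes)
{
        /*
         * Small writes are cheap to store twice (log + final location);
         * large writes are cheaper to write once and reference from the
         * log, avoiding a second full copy of the data.
         */
        return (nbytes <= DEMO_IMMEDIATE_MAX ?
            DEMO_WR_IMMEDIATE : DEMO_WR_INDIRECT);
}

int
main(void)
{
        printf("4 KiB write -> %s\n", demo_choose_flavor(4096) ==
            DEMO_WR_IMMEDIATE ? "immediate" : "indirect");
        printf("1 MiB write -> %s\n", demo_choose_flavor(1 << 20) ==
            DEMO_WR_IMMEDIATE ? "immediate" : "indirect");
        return (0);
}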
*/ int zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; uint64_t zp_gen; ASSERT3P(lwb, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } /* check if generation number matches */ if (sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (zp_gen)) != 0) { zfs_zrele_async(zp); return (SET_ERROR(EIO)); } if (zp_gen != gen) { zfs_zrele_async(zp); return (SET_ERROR(ENOENT)); } zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold_noread(os, object, offset, zgd, &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP. */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } static void zfs_get_done(zgd_t *zgd, int error) { (void) error; znode_t *zp = zgd->zgd_private; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. 
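/*
 * The indirect-write path above must lock the whole block, but the
 * block size can change between sampling it and acquiring the range
 * lock, so the code re-checks after locking and retries on a mismatch.
 * A hedged, generic sketch of that lock / re-check / retry pattern
 * follows, using a pthread mutex and illustrative demo_* names.
 */
#include <pthread.h>
#include <stdint.h>

typedef struct {
        pthread_mutex_t lock;
        uint64_t        blksz;  /* may change while unlocked */
} demo_obj_t;

/*
 * Snapshot the block size, take the lock sized for that snapshot, then
 * verify the snapshot still holds; if not, drop the lock and retry.
 */
static uint64_t
demo_lock_whole_block(demo_obj_t *obj)
{
        for (;;) {
                uint64_t size = obj->blksz;     /* unlocked snapshot */

                pthread_mutex_lock(&obj->lock);
                if (obj->blksz == size)
                        return (size);          /* snapshot still valid */
                pthread_mutex_unlock(&obj->lock);       /* raced; retry */
        }
}

int
main(void)
{
        demo_obj_t obj = { PTHREAD_MUTEX_INITIALIZER, 4096 };
        uint64_t size = demo_lock_whole_block(&obj);

        pthread_mutex_unlock(&obj.lock);
        return (size == 4096 ? 0 : 1);
}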
*/ zfs_zrele_async(zp); kmem_free(zgd, sizeof (zgd_t)); } static int zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) { int error; /* Swap. Not sure if the order of zfs_enter()s is important. */ if (zfsvfs1 > zfsvfs2) { zfsvfs_t *tmpzfsvfs; tmpzfsvfs = zfsvfs2; zfsvfs2 = zfsvfs1; zfsvfs1 = tmpzfsvfs; } error = zfs_enter(zfsvfs1, tag); if (error != 0) return (error); if (zfsvfs1 != zfsvfs2) { error = zfs_enter(zfsvfs2, tag); if (error != 0) { zfs_exit(zfsvfs1, tag); return (error); } } return (0); } static void zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag) { zfs_exit(zfsvfs1, tag); if (zfsvfs1 != zfsvfs2) zfs_exit(zfsvfs2, tag); } /* * We split each clone request in chunks that can fit into a single ZIL * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives * us room for storing 1022 block pointers. * * On success, the function return the number of bytes copied in *lenp. * Note, it doesn't return how much bytes are left to be copied. * On errors which are caused by any file system limitations or * brt limitations `EINVAL` is returned. In the most cases a user * requested bad parameters, it could be possible to clone the file but * some parameters don't match the requirements. */ int zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, uint64_t *outoffp, uint64_t *lenp, cred_t *cr) { zfsvfs_t *inzfsvfs, *outzfsvfs; objset_t *inos, *outos; zfs_locked_range_t *inlr, *outlr; dmu_buf_impl_t *db; dmu_tx_t *tx; zilog_t *zilog; uint64_t inoff, outoff, len, done; uint64_t outsize, size; int error; int count = 0; sa_bulk_attr_t bulk[3]; uint64_t mtime[2], ctime[2]; uint64_t uid, gid, projid; blkptr_t *bps; size_t maxblocks, nbps; uint_t inblksz; uint64_t clear_setid_bits_txg = 0; inoff = *inoffp; outoff = *outoffp; len = *lenp; done = 0; inzfsvfs = ZTOZSB(inzp); outzfsvfs = ZTOZSB(outzp); /* * We need to call zfs_enter() potentially on two different datasets, * so we need a dedicated function for that. */ error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); if (error != 0) return (error); inos = inzfsvfs->z_os; outos = outzfsvfs->z_os; /* * Both source and destination have to belong to the same storage pool. */ if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* * outos and inos belongs to the same storage pool. * see a few lines above, only one check. */ if (!spa_feature_is_enabled(dmu_objset_spa(outos), SPA_FEATURE_BLOCK_CLONING)) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EOPNOTSUPP)); } ASSERT(!outzfsvfs->z_replay); /* * Block cloning from an unencrypted dataset into an encrypted * dataset and vice versa is not supported. */ if (inos->os_encrypted != outos->os_encrypted) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EXDEV)); } error = zfs_verify_zp(inzp); if (error == 0) error = zfs_verify_zp(outzp); if (error != 0) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (error); } /* * We don't copy source file's flags that's why we don't allow to clone * files that are in quarantine. 
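/*
 * zfs_enter_two() above avoids a classic ABBA deadlock by always
 * entering the two zfsvfs structures in a single global order (by
 * address); the same idea orders the source and destination range
 * locks a little further down.  A hedged, generic sketch of the
 * technique using pthread mutexes; names are illustrative.
 */
#include <pthread.h>
#include <stdint.h>

/*
 * Acquire two locks in ascending-address order so that two threads
 * locking the same pair can never deadlock; if both arguments are the
 * same lock, take it only once.
 */
static void
demo_lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if ((uintptr_t)a > (uintptr_t)b) {
                pthread_mutex_t *tmp = a;

                a = b;
                b = tmp;
        }
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
}

int
main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        demo_lock_pair(&m2, &m1);       /* argument order does not matter */
        pthread_mutex_unlock(&m1);
        pthread_mutex_unlock(&m2);
        return (0);
}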
*/ if (inzp->z_pflags & ZFS_AV_QUARANTINED) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (inoff >= inzp->z_size) { *lenp = 0; zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (0); } if (len > inzp->z_size - inoff) { len = inzp->z_size - inoff; } if (len == 0) { *lenp = 0; zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (0); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(outzfsvfs)) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * No overlapping if we are cloning within the same file. */ if (inzp == outzp) { if (inoff < outoff + len && outoff < inoff + len) { zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (SET_ERROR(EINVAL)); } } /* * Maintain predictable lock order. */ if (inzp < outzp || (inzp == outzp && inoff < outoff)) { inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, RL_READER); outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, RL_WRITER); } else { outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len, RL_WRITER); inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len, RL_READER); } inblksz = inzp->z_blksz; /* * We cannot clone into files with different block size if we can't * grow it (block size is already bigger or more than one block). */ if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz || outzp->z_size > inblksz)) { error = SET_ERROR(EINVAL); goto unlock; } /* * Block size must be power-of-2 if destination offset != 0. * There can be no multiple blocks of non-power-of-2 size. */ if (outoff != 0 && !ISP2(inblksz)) { error = SET_ERROR(EINVAL); goto unlock; } /* * Offsets and len must be at block boundries. */ if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) { error = SET_ERROR(EINVAL); goto unlock; } /* * Length must be multipe of blksz, except for the end of the file. */ if ((len % inblksz) != 0 && (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) { error = SET_ERROR(EINVAL); goto unlock; } /* * If we are copying only one block and it is smaller than recordsize * property, do not allow destination to grow beyond one block if it * is not there yet. Otherwise the destination will get stuck with * that block size forever, that can be as small as 512 bytes, no * matter how big the destination grow later. */ if (len <= inblksz && inblksz < outzfsvfs->z_max_blksz && outzp->z_size <= inblksz && outoff + len > inblksz) { error = SET_ERROR(EINVAL); goto unlock; } error = zn_rlimit_fsize(outoff + len); if (error != 0) { goto unlock; } if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) { error = SET_ERROR(EFBIG); goto unlock; } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL, &outzp->z_size, 8); zilog = outzfsvfs->z_log; maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) / sizeof (bps[0]); uid = KUID_TO_SUID(ZTOUID(outzp)); gid = KGID_TO_SGID(ZTOGID(outzp)); projid = outzp->z_projid; bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); /* * Clone the file in reasonable size chunks. 
Each chunk is cloned * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (len > 0) { size = MIN(inblksz * maxblocks, len); if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } nbps = maxblocks; error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created * in the current transaction group, error will be * EAGAIN here, which we can just return to the caller * so it can fallback if it likes. */ break; } /* * Encrypted data is fine as long as it comes from the same * dataset. * TODO: We want to extend it in the future to allow cloning to * datasets with the same keys, like clones or to be able to * clone a file from a snapshot of an encrypted dataset into the * dataset itself. */ if (BP_IS_PROTECTED(&bps[0])) { if (inzfsvfs != outzfsvfs) { error = SET_ERROR(EXDEV); break; } } /* * Start a transaction. */ tx = dmu_tx_create(outos); dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, outzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); break; } /* * Copy source znode's block size. This only happens on the * first iteration since zfs_rangelock_reduce() will shrink down * lr_len to the appropriate size. */ if (outlr->lr_length == UINT64_MAX) { zfs_grow_blocksize(outzp, inblksz, tx); /* * Round range lock up to the block boundary, so we * prevent appends until we are done. */ zfs_rangelock_reduce(outlr, outoff, ((len - 1) / inblksz + 1) * inblksz); } error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, - bps, nbps, B_FALSE); + bps, nbps); if (error != 0) { dmu_tx_commit(tx); break; } zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, &clear_setid_bits_txg, tx); zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((outsize = outzp->z_size) < outoff + size) { (void) atomic_cas_64(&outzp->z_size, outsize, outoff + size); } error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx); zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff, size, inblksz, bps, nbps); dmu_tx_commit(tx); if (error != 0) break; inoff += size; outoff += size; len -= size; done += size; } vmem_free(bps, sizeof (bps[0]) * maxblocks); zfs_znode_update_vfs(outzp); unlock: zfs_rangelock_exit(outlr); zfs_rangelock_exit(inlr); if (done > 0) { /* * If we have made at least partial progress, reset the error. */ error = 0; ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp); if (outos->os_sync == ZFS_SYNC_ALWAYS) { zil_commit(zilog, outzp->z_id); } *inoffp += done; *outoffp += done; *lenp = done; } else { /* * If we made no progress, there must be a good reason. * EOF is handled explicitly above, before the loop. */ ASSERT3S(error, !=, 0); } zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); return (error); } /* * Usual pattern would be to call zfs_clone_range() from zfs_replay_clone(), * but we cannot do that, because when replaying we don't have source znode * available. This is why we need a dedicated replay function. 
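/*
 * The loop above advances in chunks of MIN(inblksz * maxblocks, len),
 * where maxblocks is how many block pointers fit in one ZIL clone
 * record (the ~130816-byte / 1022-BP figures quoted in the earlier
 * comment).  A hedged arithmetic sketch using those quoted numbers;
 * the real values come from zil_max_log_data() and sizeof (blkptr_t).
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_MAX_LOG_DATA       130816ULL       /* payload bytes quoted above */
#define DEMO_BP_SIZE            128ULL          /* on-disk block pointer size */

int
main(void)
{
        uint64_t maxblocks = DEMO_MAX_LOG_DATA / DEMO_BP_SIZE;  /* 1022 */
        uint64_t inblksz = 128ULL * 1024;       /* example source block size */
        uint64_t len = 512ULL * 1024 * 1024;    /* bytes requested */
        int chunks = 0;

        while (len > 0) {
                uint64_t size = inblksz * maxblocks;

                if (size > len)
                        size = len;
                chunks++;
                len -= size;
        }
        printf("%llu BPs per record, %d ZIL records for the request\n",
            (unsigned long long)maxblocks, chunks);
        return (0);
}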
*/ int zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) { zfsvfs_t *zfsvfs; dmu_buf_impl_t *db; dmu_tx_t *tx; int error; int count = 0; sa_bulk_attr_t bulk[3]; uint64_t mtime[2], ctime[2]; ASSERT3U(off, <, MAXOFFSET_T); ASSERT3U(len, >, 0); ASSERT3U(nbps, >, 0); zfsvfs = ZTOZSB(zp); ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), SPA_FEATURE_BLOCK_CLONING)); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); ASSERT(zfsvfs->z_replay); ASSERT(!zfs_is_readonly(zfsvfs)); if ((off % blksz) != 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } if (zp->z_blksz < blksz) zfs_grow_blocksize(zp, blksz, tx); - dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE); + dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); if (zp->z_size < off + len) zp->z_size = off + len; error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); /* * zil_replaying() not only check if we are replaying ZIL, but also * updates the ZIL header to record replay progress. */ VERIFY(zil_replaying(zfsvfs->z_log, tx)); dmu_tx_commit(tx); zfs_znode_update_vfs(zp); zfs_exit(zfsvfs, FTAG); return (error); } EXPORT_SYMBOL(zfs_access); EXPORT_SYMBOL(zfs_fsync); EXPORT_SYMBOL(zfs_holey); EXPORT_SYMBOL(zfs_read); EXPORT_SYMBOL(zfs_write); EXPORT_SYMBOL(zfs_getsecattr); EXPORT_SYMBOL(zfs_setsecattr); EXPORT_SYMBOL(zfs_clone_range); EXPORT_SYMBOL(zfs_clone_range_replay); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c index b2699caa7589..7dfdaa081e8a 100644 --- a/sys/contrib/openzfs/module/zfs/zil.c +++ b/sys/contrib/openzfs/module/zfs/zil.c @@ -1,4222 +1,4242 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright (c) 2018 Datto Inc. 
*/ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system * calls that change the file system. Each itx has enough information to * be able to replay them after a system crash, power loss, or * equivalent failure mode. These are stored in memory until either: * * 1. they are committed to the pool by the DMU transaction group * (txg), at which point they can be discarded; or * 2. they are committed to the on-disk ZIL for the dataset being * modified (e.g. due to an fsync, O_DSYNC, or other synchronous * requirement). * * In the event of a crash or power loss, the itxs contained by each * dataset's on-disk ZIL will be replayed when that dataset is first * instantiated (e.g. if the dataset is a normal filesystem, when it is * first mounted). * * As hinted at above, there is one ZIL per dataset (both the in-memory * representation, and the on-disk representation). The on-disk format * consists of 3 parts: * * - a single, per-dataset, ZIL header; which points to a chain of * - zero or more ZIL blocks; each of which contains * - zero or more ZIL records * * A ZIL record holds the information necessary to replay a single * system call transaction. A ZIL block can hold many ZIL records, and * the blocks are chained together, similarly to a singly linked list. * * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL * block in the chain, and the ZIL header points to the first block in * the chain. * * Note, there is not a fixed place in the pool to hold these ZIL * blocks; they are dynamically allocated and freed as needed from the * blocks available on the pool, though they can be preferentially * allocated from a dedicated "log" vdev. */ /* * This controls the amount of time that a ZIL block (lwb) will remain * "open" when it isn't "full", and it has a thread waiting for it to be * committed to stable storage. Please refer to the zil_commit_waiter() * function (and the comments within it) for more details. */ static uint_t zfs_commit_timeout_pct = 10; /* * See zil.h for more information about these fields. */ static zil_kstat_values_t zil_stats = { { "zil_commit_count", KSTAT_DATA_UINT64 }, { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, { "zil_itx_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; static kstat_t *zil_kstats_global; /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; /* * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to * the disk(s) by the ZIL after an LWB write has completed. 
Setting this * will cause ZIL corruption on power loss if a volatile out-of-order * write cache is enabled. */ static int zil_nocacheflush = 0; /* * Limit SLOG write size per commit executed with synchronous priority. * Any writes above that will be executed with lower (asynchronous) priority * to limit potential SLOG device abuse by single active ZIL writer. */ static uint64_t zil_slog_bulk = 64 * 1024 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); static itx_t *zil_itx_clone(itx_t *oitx); static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (likely(cmp)) return (cmp); return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_0], sizeof (zc->zc_word[ZIL_ZC_GUID_0])); (void) random_get_pseudo_bytes((void *)&zc->zc_word[ZIL_ZC_GUID_1], sizeof (zc->zc_word[ZIL_ZC_GUID_1])); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } static int zil_kstats_global_update(kstat_t *ksp, int rw) { zil_kstat_values_t *zs = ksp->ks_data; ASSERT3P(&zil_stats, ==, zs); if (rw == KSTAT_WRITE) { return (SET_ERROR(EACCES)); } zil_kstat_values_update(zs, &zil_sums_global); return (0); } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; if (!decrypt) zio_flags |= ZIO_FLAG_RAW; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. 
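/*
 * zil_read_log_block() below validates the chain exactly as the
 * comment describes: the expected checksum seed for the next block is
 * this block's checksum with the sequence word incremented, and the
 * block is rejected if its recorded next-block checksum or used length
 * does not match.  A hedged sketch of the linkage check follows;
 * demo_cksum_t and the sequence-word index are simplified stand-ins
 * for zio_cksum_t and ZIL_ZC_SEQ.
 */
#include <stdint.h>
#include <string.h>

#define DEMO_ZC_SEQ     3       /* index of the sequence word, illustrative */

typedef struct {
        uint64_t zc_word[4];    /* stand-in for zio_cksum_t */
} demo_cksum_t;

/*
 * The next block in the chain must carry this block's checksum with
 * the sequence word bumped by one; anything else ends the chain.
 */
static int
demo_chain_link_ok(const demo_cksum_t *this_blk,
    const demo_cksum_t *next_blk_recorded)
{
        demo_cksum_t expect = *this_blk;

        expect.zc_word[DEMO_ZC_SEQ]++;
        return (memcmp(&expect, next_blk_recorded, sizeof (expect)) == 0);
}

int
main(void)
{
        demo_cksum_t cur = { { 1, 2, 3, 7 } };
        demo_cksum_t next = { { 1, 2, 3, 8 } };

        return (demo_chain_link_ok(&cur, &next) ? 0 : 1);
}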
*/ cksum.zc_word[ZIL_ZC_SEQ]++; uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || zilc->zc_nused < sizeof (*zilc) || zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { *begin = lr; *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; /* * If we are not using the resulting data, we are just checking that * it hasn't been corrupted so we don't need to waste CPU time * decompressing and decrypting it. */ if (wbuf == NULL) zio_flags |= ZIO_FLAG_RAW; ASSERT3U(BP_GET_LSIZE(bp), !=, 0); SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) memcpy(wbuf, abuf->b_data, arc_buf_size(abuf)); arc_buf_destroy(abuf, &abuf); } return (error); } void zil_sums_init(zil_sums_t *zs) { wmsum_init(&zs->zil_commit_count, 0); wmsum_init(&zs->zil_commit_writer_count, 0); wmsum_init(&zs->zil_itx_count, 0); wmsum_init(&zs->zil_itx_indirect_count, 0); wmsum_init(&zs->zil_itx_indirect_bytes, 0); wmsum_init(&zs->zil_itx_copied_count, 0); wmsum_init(&zs->zil_itx_copied_bytes, 0); wmsum_init(&zs->zil_itx_needcopy_count, 0); wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void zil_sums_fini(zil_sums_t *zs) { wmsum_fini(&zs->zil_commit_count); wmsum_fini(&zs->zil_commit_writer_count); wmsum_fini(&zs->zil_itx_count); wmsum_fini(&zs->zil_itx_indirect_count); wmsum_fini(&zs->zil_itx_indirect_bytes); wmsum_fini(&zs->zil_itx_copied_count); wmsum_fini(&zs->zil_itx_copied_bytes); wmsum_fini(&zs->zil_itx_needcopy_count); wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_write); wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); wmsum_fini(&zs->zil_itx_metaslab_slog_count); wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); wmsum_fini(&zs->zil_itx_metaslab_slog_write); wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); } void zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) { 
zs->zil_commit_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_count); zs->zil_commit_writer_count.value.ui64 = wmsum_value(&zil_sums->zil_commit_writer_count); zs->zil_itx_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_count); zs->zil_itx_indirect_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_count); zs->zil_itx_indirect_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_indirect_bytes); zs->zil_itx_copied_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_count); zs->zil_itx_copied_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_copied_bytes); zs->zil_itx_needcopy_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_count); zs->zil_itx_needcopy_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_needcopy_bytes); zs->zil_itx_metaslab_normal_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); zs->zil_itx_metaslab_normal_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); zs->zil_itx_metaslab_normal_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); zs->zil_itx_metaslab_normal_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); zs->zil_itx_metaslab_slog_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); zs->zil_itx_metaslab_slog_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); zs->zil_itx_metaslab_slog_write.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); zs->zil_itx_metaslab_slog_alloc.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, boolean_t decrypt) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk = {{{{0}}}}; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. 
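/*
 * The zil_parse() loop that follows walks each block's records by
 * lrc_reclen; the assertions added in this change pin the invariant
 * that every record is at least sizeof (lr_t) and never extends past
 * the end of the block, and the walk stops once a record's sequence
 * number exceeds the claimed maximum.  A hedged, self-contained sketch
 * of that walk with a simplified record header (demo_lr_t stands in
 * for lr_t).
 */
#include <stddef.h>
#include <stdint.h>

typedef struct {
        uint64_t lrc_txtype;
        uint64_t lrc_reclen;
        uint64_t lrc_txg;
        uint64_t lrc_seq;
} demo_lr_t;

/*
 * Walk [lrp, end) record by record.  Returns -1 on a malformed record,
 * 0 otherwise; stops early once the claimed sequence range is exceeded.
 */
static int
demo_walk_records(const char *lrp, const char *end, uint64_t claim_lr_seq)
{
        while (lrp < end) {
                const demo_lr_t *lr = (const demo_lr_t *)lrp;
                uint64_t reclen = lr->lrc_reclen;

                if (reclen < sizeof (*lr) || reclen > (uint64_t)(end - lrp))
                        return (-1);    /* record does not fit the block */
                if (lr->lrc_seq > claim_lr_seq)
                        return (0);     /* past the claimed tail; stop */
                /* ... dispatch on lr->lrc_txtype here ... */
                lrp += reclen;
        }
        return (0);
}

int
main(void)
{
        demo_lr_t recs[2] = {
                { 1, sizeof (demo_lr_t), 10, 1 },
                { 1, sizeof (demo_lr_t), 10, 2 },
        };

        return (demo_walk_records((const char *)recs,
            (const char *)(recs + 2), UINT64_MAX));
}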
*/ zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *lrp, *end; arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; error = parse_blk_func(zilog, &blk, arg, txg); if (error != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, &lrp, &end, &abuf); if (error != 0) { if (abuf) arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS read log block error %d, " "dataset %s, seq 0x%llx\n", error, name, (u_longlong_t)blk_seq); } break; } for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); + ASSERT3U(reclen, <=, end - lrp); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; } error = parse_lr_func(zilog, lr, arg, txg); if (error != 0) { arc_buf_destroy(abuf, &abuf); goto done; } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); return (error); } static int zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { (void) tx; ASSERT(!BP_IS_HOLE(bp)); /* * As we call this function from the context of a rewind to a * checkpoint, each ZIL block whose txg is later than the txg * that we rewind to is invalid. Thus, we return -1 so * zil_parse() doesn't attempt to read it. */ if (bp->blk_birth >= first_txg) return (-1); if (zil_bp_tree_add(zilog, bp) != 0) return (0); zio_free(zilog->zl_spa, first_txg, bp); return (0); } static int zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { (void) zilog, (void) lrc, (void) tx, (void) first_txg; return (0); } static int zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. 
*/ if (lr->lr_blkptr.blk_birth >= first_txg) { error = zil_read_log_data(zilog, lr, NULL); if (error != 0) return (error); } return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } static int zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); } /* * XXX: Do we need to byteswap lr? */ spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as * it contains the data. */ if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { brt_pending_add(spa, bp, tx); } } return (0); } static int zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) { switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: return (zil_claim_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx, uint64_t claim_txg) { (void) claim_txg; zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If we previously claimed it, we need to free it. */ if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) { zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); } return (0); } static int zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); } spa = zilog->zl_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; if (!BP_IS_HOLE(bp)) { zio_free(spa, dmu_tx_get_txg(tx), bp); } } return (0); } static int zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) { if (claim_txg == 0) { return (0); } switch (lrc->lrc_txtype) { case TX_WRITE: return (zil_free_write(zilog, lrc, tx, claim_txg)); case TX_CLONE_RANGE: return (zil_free_clone_range(zilog, lrc, tx)); default: return (0); } } static int zil_lwb_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; return (TREE_CMP(v1, v2)); } /* * Allocate a new lwb. We may already have a block pointer for it, in which * case we get size and version from there. Or we may not yet, in which case * we choose them here and later make the block allocation match. 
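/*
 * zil_alloc_lwb(), defined next, accounts for buffer space differently
 * under the two chain formats: with the slim chain the zil_chain_t
 * header sits at the front of the block and is counted as already-used
 * space, while the legacy format reserves room for the trailer off the
 * end instead; either way the usable payload is the block size minus
 * the chain structure.  A hedged sketch of that accounting follows;
 * DEMO_CHAIN_HDR is a stand-in, not the real sizeof (zil_chain_t).
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_CHAIN_HDR  192ULL  /* stand-in chain-structure size */

typedef struct {
        uint64_t nmax;          /* bytes usable for records in this block */
        uint64_t nused;         /* bytes already consumed */
} demo_lwb_space_t;

static demo_lwb_space_t
demo_lwb_space_init(uint64_t blksz, int slim)
{
        demo_lwb_space_t s;

        if (slim) {
                s.nmax = blksz;                 /* header at the front... */
                s.nused = DEMO_CHAIN_HDR;       /* ...already counted used */
        } else {
                s.nmax = blksz - DEMO_CHAIN_HDR;        /* trailer at the end */
                s.nused = 0;
        }
        return (s);
}

int
main(void)
{
        demo_lwb_space_t slim = demo_lwb_space_init(128 * 1024, 1);
        demo_lwb_space_t old = demo_lwb_space_init(128 * 1024, 0);

        printf("slim:   max %llu, used %llu\n",
            (unsigned long long)slim.nmax, (unsigned long long)slim.nused);
        printf("legacy: max %llu, used %llu\n",
            (unsigned long long)old.nmax, (unsigned long long)old.nused);
        return (0);
}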
*/ static lwb_t * zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb_state_t state) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; if (bp) { lwb->lwb_blk = *bp; lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2); sz = BP_GET_LSIZE(bp); } else { BP_ZERO(&lwb->lwb_blk); lwb->lwb_slim = (spa_version(zilog->zl_spa) >= SPA_VERSION_SLIM_ZIL); } lwb->lwb_slog = slog; lwb->lwb_error = 0; if (lwb->lwb_slim) { lwb->lwb_nmax = sz; lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); } else { lwb->lwb_nmax = sz - sizeof (zil_chain_t); lwb->lwb_nused = lwb->lwb_nfilled = 0; } lwb->lwb_sz = sz; lwb->lwb_state = state; lwb->lwb_buf = zio_buf_alloc(sz); lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; lwb->lwb_alloc_txg = txg; lwb->lwb_max_txg = 0; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); if (state != LWB_STATE_NEW) zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); return (lwb); } static void zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_FLUSH_DONE); ASSERT3P(lwb->lwb_child_zio, ==, NULL); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa)); ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); VERIFY(list_is_empty(&lwb->lwb_itxs)); VERIFY(list_is_empty(&lwb->lwb_waiters)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); /* * Clear the zilog's field to indicate this lwb is no longer * valid, and prevent use-after-free errors. */ if (zilog->zl_last_lwb_opened == lwb) zilog->zl_last_lwb_opened = NULL; kmem_cache_free(zil_lwb_cache, lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ static void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(spa_writeable(zilog->zl_spa)); if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); } } /* * Determine if the zil is dirty in the specified txg. Callers wanting to * ensure that the dirty state does not change must hold the itxg_lock for * the specified txg. Holding the lock will ensure that the zil cannot be * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current * state. */ static boolean_t __maybe_unused zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) return (B_TRUE); return (B_FALSE); } /* * Determine if the zil is dirty. The zil is considered dirty if it has * any pending itx records that have not been cleaned by zil_clean(). */ static boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Its called in zil_commit context (zil_process_commit_list()/zil_create()). * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled. 
* Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every * zil_commit. */ static void zil_commit_activate_saxattr_feature(zilog_t *zilog) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); uint64_t txg = 0; dmu_tx_t *tx = NULL; if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL && !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(ds, tx); txg = dmu_tx_get_txg(tx); mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; boolean_t slog = FALSE; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianness */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, ZIL_MIN_BLKSZ, &slog); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write block (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block pointer into zh_log. * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { /* * If "zilsaxattr" feature is enabled on zpool, then activate * it now when we're creating the ZIL chain. We can't wait with * this until we write the first xattr log record because we * need to wait for the feature activation to sync out. */ if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL) { mutex_enter(&ds->ds_lock); ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] = (void *)B_TRUE; mutex_exit(&ds->ds_lock); } dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } else { /* * This branch covers the case where we enable the feature on a * zpool that has existing ZIL headers. */ zil_commit_activate_saxattr_feature(zilog); } IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL, dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)); ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); IMPLY(error == 0, lwb != NULL); return (lwb); } /* * In one tx, free all log blocks and clear the log header. If keep_first * is set, then we're replaying a log with no content. We want to keep the * first block, however, so that the first synchronous transaction doesn't * require a txg_wait_synced() in zil_create(). 
We don't need to * txg_wait_synced() here either when keep_first is set, because both * zil_create() and zil_destroy() will wait for any in-progress destroys * to complete. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return (B_FALSE); tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); return (B_TRUE); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE); } int zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) { dmu_tx_t *tx = txarg; zilog_t *zilog; uint64_t first_txg; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own_obj(dp, ds->ds_object, DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %llu, error %u", (unsigned long long)ds->ds_object, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); first_txg = spa_min_claim_txg(zilog->zl_spa); /* * If the spa_log_state is not set to be cleared, check whether * the current uberblock is a checkpoint one and if the current * header has been claimed before moving on. * * If the current uberblock is a checkpointed uberblock then * one of the following scenarios took place: * * 1] We are currently rewinding to the checkpoint of the pool. * 2] We crashed in the middle of a checkpoint rewind but we * did manage to write the checkpointed uberblock to the * vdev labels, so when we tried to import the pool again * the checkpointed uberblock was selected from the import * procedure. * * In both cases we want to zero out all the ZIL blocks, except * the ones that have been claimed at the time of the checkpoint * (their zh_claim_txg != 0). The reason is that these blocks * may be corrupted since we may have reused their locations on * disk after we took the checkpoint. * * We could try to set spa_log_state to SPA_LOG_CLEAR earlier * when we first figure out whether the current uberblock is * checkpointed or not. Unfortunately, that would discard all * the logs, including the ones that are claimed, and we would * leak space. 
*/ if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0)) { if (!BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_clear_log_block, zil_noop_log_record, tx, first_txg, B_FALSE); } BP_ZERO(&zh->zh_log); if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * If we are not rewinding and opening the pool normally, then * the min_claim_txg should be equal to the first txg of the pool. */ ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg, B_FALSE); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; if (os->os_encrypted) os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, B_FALSE, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ int zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) { (void) dp; zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_from_ds(ds, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset %llu, error %d", (unsigned long long)ds->ds_object, error); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the * log as its content should have already been synced to the * pool. */ spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) return (0); /* * Check whether the current uberblock is checkpointed (e.g. * we are rewinding) and whether the current header has been * claimed or not. If it hasn't then skip verifying it. We * do this because its ZIL blocks may be part of the pool's * state before the rewind, which is no longer valid. */ zil_header_t *zh = zil_header_in_syncing_context(zilog); if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && zh->zh_claim_txg == 0) return (0); } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. 
* In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } /* * When an itx is "skipped", this function is used to properly mark the * waiter as "done, and signal any thread(s) waiting on it. An itx can * be skipped (and not committed to an lwb) for a variety of reasons, * one of them being that the itx was committed via spa_sync(), prior to * it being committed to an lwb; this can happen if a thread calling * zil_commit() is racing with spa_sync(). */ static void zil_commit_waiter_skip(zil_commit_waiter_t *zcw) { mutex_enter(&zcw->zcw_lock); ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } /* * This function is used when the given waiter is to be linked into an * lwb's "lwb_waiter" list; i.e. when the itx is committed to the lwb. * At this point, the waiter will no longer be referenced by the itx, * and instead, will be referenced by the lwb. */ static void zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) { /* * The lwb_waiters field of the lwb is protected by the zilog's * zl_issuer_lock while the lwb is open and zl_lock otherwise. * zl_issuer_lock also protects leaving the open state. * zcw_lwb setting is protected by zl_issuer_lock and state != * flush_done, which transition is protected by zl_lock. */ ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock)); IMPLY(lwb->lwb_state != LWB_STATE_OPENED, MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(&lwb->lwb_waiters, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); zcw->zcw_lwb = lwb; } /* * This function is used when zio_alloc_zil() fails to allocate a ZIL * block, and the given waiter must be linked to the "nolwb waiters" * list inside of zil_process_commit_list(). 
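zil_commit_waiter_skip() above is the usual "mark done under the lock, then broadcast" handshake between the ZIL writer and a thread sleeping in zil_commit(). A minimal pthread sketch of the same pattern (the names mirror the waiter fields but this is not the kernel cv/mutex API):

    #include <pthread.h>
    #include <stdbool.h>

    typedef struct {
        pthread_mutex_t zcw_lock;
        pthread_cond_t  zcw_cv;
        bool            zcw_done;
    } waiter_t;

    /* Producer side: equivalent of zil_commit_waiter_skip(). */
    static void
    waiter_skip(waiter_t *zcw)
    {
        pthread_mutex_lock(&zcw->zcw_lock);
        zcw->zcw_done = true;
        pthread_cond_broadcast(&zcw->zcw_cv);
        pthread_mutex_unlock(&zcw->zcw_lock);
    }

    /* Consumer side: what the committing thread does while it sleeps. */
    static void
    waiter_wait(waiter_t *zcw)
    {
        pthread_mutex_lock(&zcw->zcw_lock);
        while (!zcw->zcw_done)
            pthread_cond_wait(&zcw->zcw_cv, &zcw->zcw_lock);
        pthread_mutex_unlock(&zcw->zcw_lock);
    }

    static void *
    producer(void *arg)
    {
        waiter_skip(arg);
        return (NULL);
    }

    int
    main(void)
    {
        waiter_t zcw = { PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_COND_INITIALIZER, false };
        pthread_t t;

        pthread_create(&t, NULL, producer, &zcw);
        waiter_wait(&zcw);
        pthread_join(t, NULL);
        return (0);
    }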
*/ static void zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) { ASSERT(!list_link_active(&zcw->zcw_node)); list_insert_tail(nolwb, zcw); ASSERT3P(zcw->zcw_lwb, ==, NULL); } void zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) { avl_tree_t *t = &lwb->lwb_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); if (zil_nocacheflush) return; mutex_enter(&lwb->lwb_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&lwb->lwb_vdev_lock); } static void zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) { avl_tree_t *src = &lwb->lwb_vdev_tree; avl_tree_t *dst = &nlwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); /* * While 'lwb' is at a point in its lifetime where lwb_vdev_tree does * not need the protection of lwb_vdev_lock (it will only be modified * while holding zilog->zl_lock) as its writes and those of its * children have all completed. The younger 'nlwb' may be waiting on * future writes to additional vdevs. */ mutex_enter(&nlwb->lwb_vdev_lock); /* * Tear down the 'lwb' vdev tree, ensuring that entries which do not * exist in 'nlwb' are moved to it, freeing any would-be duplicates. */ while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { avl_index_t where; if (avl_find(dst, zv, &where) == NULL) { avl_insert(dst, zv, where); } else { kmem_free(zv, sizeof (*zv)); } } mutex_exit(&nlwb->lwb_vdev_lock); } void zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) { lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); } /* * This function is a called after all vdevs associated with a given lwb * write have completed their DKIOCFLUSHWRITECACHE command; or as soon * as the lwb write completes, if "zil_nocacheflush" is set. Further, * all "previous" lwb's will have completed before this function is * called; i.e. this function is called for all previous lwbs before * it's called for "this" lwb (enforced via zio the dependencies * configured in zil_lwb_set_zio_dependency()). * * The intention is for this function to be called as soon as the * contents of an lwb are considered "stable" on disk, and will survive * any sudden loss of power. At this point, any threads waiting for the * lwb to reach this state are signalled, and the "waiter" structures * are marked "done". */ static void zil_lwb_flush_vdevs_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; zil_commit_waiter_t *zcw; itx_t *itx; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number * for ztest. We only update this value when all the log * writes succeeded, because ztest wants to ASSERT that * it got the whole log chain. 
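The zl_last_lwb_latency update in zil_lwb_flush_vdevs_done() above is a 1/8-weight exponential moving average kept in integer arithmetic, so a single slow lwb only pulls the estimate up by an eighth of its excess. The same smoothing, stand-alone:

    #include <stdint.h>
    #include <stdio.h>

    /* new_avg = old_avg * 7/8 + sample * 1/8, all in integer math. */
    static uint64_t
    ewma_update(uint64_t avg, uint64_t sample)
    {
        return ((avg * 7 + sample) / 8);
    }

    int
    main(void)
    {
        uint64_t lat = 0;
        uint64_t samples[] = { 800, 900, 850, 4000, 820 };

        for (unsigned i = 0; i < sizeof (samples) / sizeof (samples[0]); i++) {
            lat = ewma_update(lat, samples[i]);
            printf("sample %llu -> avg %llu\n",
                (unsigned long long)samples[i], (unsigned long long)lat);
        }
        return (0);
    }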
*/ zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been * propagated "up" to this specific LWB's root ZIO, in * order for this error handling to work correctly. This * includes ZIO errors from either this LWB's write or * flush, as well as any errors from other dependent LWBs * (e.g. a root LWB ZIO that might be a child of this LWB). * * With that said, it's important to note that LWB flush * errors are not propagated up to the LWB root ZIO. * This is incorrect behavior, and results in VDEV flush * errors not being handled correctly here. See the * comment above the call to "zio_flush" for details. */ zcw->zcw_zio_error = zio->io_error; ASSERT3B(zcw->zcw_done, ==, B_FALSE); zcw->zcw_done = B_TRUE; cv_broadcast(&zcw->zcw_cv); mutex_exit(&zcw->zcw_lock); } uint64_t txg = lwb->lwb_issued_txg; /* Once we drop the lock, lwb may be freed by zil_sync(). */ mutex_exit(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) cv_broadcast(&zilog->zl_lwb_io_cv); mutex_exit(&zilog->zl_lwb_io_lock); } /* * Wait for the completion of all issued write/flush of that txg provided. * It guarantees zil_lwb_flush_vdevs_done() is called and returned. */ static void zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) { ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa)); mutex_enter(&zilog->zl_lwb_io_lock); while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0) cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lwb_io_lock); #ifdef ZFS_DEBUG mutex_enter(&zilog->zl_lock); mutex_enter(&zilog->zl_lwb_io_lock); lwb_t *lwb = list_head(&zilog->zl_lwb_list); while (lwb != NULL) { if (lwb->lwb_issued_txg <= txg) { ASSERT(lwb->lwb_state != LWB_STATE_ISSUED); ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE); IMPLY(lwb->lwb_issued_txg > 0, lwb->lwb_state == LWB_STATE_FLUSH_DONE); } IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE, lwb->lwb_buf == NULL); lwb = list_next(&zilog->zl_lwb_list, lwb); } mutex_exit(&zilog->zl_lwb_io_lock); mutex_exit(&zilog->zl_lock); #endif } /* * This is called when an lwb's write zio completes. The callback's * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved * in writing out this specific lwb's data, and in the case that cache * flushes have been deferred, vdevs involved in writing the data for * previous lwbs. The writes corresponding to all the vdevs in the * lwb_vdev_tree will have completed by the time this is called, due to * the zio dependencies configured in zil_lwb_set_zio_dependency(), * which takes deferred flushes into account. The lwb will be "done" * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio * completion callback for the lwb's root zio. 
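zil_lwb_flush_wait_all() above pairs a per-txg inflight counter with a condition variable: each completion decrements and broadcasts when the count reaches zero, and the syncing thread sleeps until it does. A pthread sketch of that gate, simplified to a single counter rather than the per-txg array (names are illustrative):

    #include <pthread.h>

    typedef struct {
        pthread_mutex_t lock;
        pthread_cond_t  cv;
        unsigned        inflight;
    } flush_gate_t;

    static void
    flush_issue(flush_gate_t *g)
    {
        pthread_mutex_lock(&g->lock);
        g->inflight++;
        pthread_mutex_unlock(&g->lock);
    }

    /* Completion path: mirrors the tail of zil_lwb_flush_vdevs_done(). */
    static void
    flush_done(flush_gate_t *g)
    {
        pthread_mutex_lock(&g->lock);
        if (--g->inflight == 0)
            pthread_cond_broadcast(&g->cv);
        pthread_mutex_unlock(&g->lock);
    }

    /* Waiter: mirrors zil_lwb_flush_wait_all(). */
    static void
    flush_wait_all(flush_gate_t *g)
    {
        pthread_mutex_lock(&g->lock);
        while (g->inflight > 0)
            pthread_cond_wait(&g->cv, &g->lock);
        pthread_mutex_unlock(&g->lock);
    }

    int
    main(void)
    {
        flush_gate_t g = { PTHREAD_MUTEX_INITIALIZER,
            PTHREAD_COND_INITIALIZER, 0 };

        flush_issue(&g);
        flush_issue(&g);
        flush_done(&g);
        flush_done(&g);
        flush_wait_all(&g);     /* returns immediately, count is zero */
        return (0);
    }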
*/ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; spa_t *spa = zio->io_spa; zilog_t *zilog = lwb->lwb_zilog; avl_tree_t *t = &lwb->lwb_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; lwb_t *nlwb; ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); abd_free(zio->io_abd); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; /* * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not * called for it yet, and when it will be, it won't be able to make * its write ZIO a parent this ZIO. In such case we can not defer * our flushes or below may be a race between the done callbacks. */ nlwb = list_next(&zilog->zl_lwb_list, lwb); if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0) return; /* * If there was an IO error, we're not going to call zio_flush() * on these vdevs, so we simply empty the tree and free the * nodes. We avoid calling zio_flush() since there isn't any * good reason for doing so, after the lwb block failed to be * written out. * * Additionally, we don't perform any further error handling at * this point (e.g. setting "zcw_zio_error" appropriately), as * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus, * we expect any error seen here, to have been propagated to * that function). */ if (zio->io_error != 0) { while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zv, sizeof (*zv)); return; } /* * If this lwb does not have any threads waiting for it to * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE * command to the vdevs written to by "this" lwb, and instead * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE * command for those vdevs. Thus, we merge the vdev tree of * "this" lwb with the vdev tree of the "next" lwb in the list, * and assume the "next" lwb will handle flushing the vdevs (or * deferring the flush(s) again). * * This is a useful performance optimization, especially for * workloads with lots of async write activity and few sync * write and/or fsync activity, as it has the potential to * coalesce multiple flush commands to a vdev into one. */ if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; } while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, * any errors when flushing the vdev(s), will * (unfortunately) not be handled correctly, * since these "zio_flush" errors will not be * propagated up to "zil_lwb_flush_vdevs_done". */ zio_flush(lwb->lwb_root_zio, vd); } kmem_free(zv, sizeof (*zv)); } } /* * Build the zio dependency chain, which is used to preserve the ordering of * lwb completions that is required by the semantics of the ZIL. Each new lwb * zio becomes a parent of the previous lwb zio, such that the new lwb's zio * cannot complete until the previous lwb's zio completes. * * This is required by the semantics of zil_commit(): the commit waiters * attached to the lwbs will be woken in the lwb zio's completion callback, * so this zio dependency graph ensures the waiters are woken in the correct * order (the same order the lwbs were created). 
*/ static void zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb); if (prev_lwb == NULL || prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE) return; /* * If the previous lwb's write hasn't already completed, we also want * to order the completion of the lwb write zios (above, we only order * the completion of the lwb root zios). This is required because of * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb. * * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous * lwb will rely on this lwb to flush the vdevs written to by that * previous lwb. Thus, we need to ensure this lwb doesn't issue the * flush until after the previous lwb's write completes. We ensure * this ordering by setting the zio parent/child relationship here. * * Without this relationship on the lwb's write zio, it's possible * for this lwb's write to complete prior to the previous lwb's write * completing; and thus, the vdevs for the previous lwb would be * flushed prior to that lwb's data being written to those vdevs (the * vdevs are flushed in the lwb write zio's completion handler, * zil_lwb_write_done()). */ if (prev_lwb->lwb_state == LWB_STATE_ISSUED) { ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL); zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio); } else { ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); } ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL); zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio); } /* * This function's purpose is to "open" an lwb such that it is ready to * accept new itxs being committed to it. This function is idempotent; if * the passed in lwb has already been opened, it is essentially a no-op. */ static void zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (lwb->lwb_state != LWB_STATE_NEW) { ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); return; } mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_OPENED; zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ static const struct { uint64_t limit; uint64_t blksz; } zil_block_buckets[] = { { 4096, 4096 }, /* non TX_WRITE */ { 8192 + 4096, 8192 + 4096 }, /* database */ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ { 131072, 131072 }, /* < 128KB writes */ { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; /* * Maximum block size used by the ZIL. This is picked up when the ZIL is * initialized. Otherwise this should not be used directly; see * zl_max_block_size instead. */ static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* * Close the log block for being issued and allocate the next one. * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state) { int i; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); lwb->lwb_state = LWB_STATE_CLOSED; /* * If there was an allocation failure then returned NULL will trigger * zil_commit_writer_stall() at the caller. This is inherently racy, * since allocation may not have happened yet. 
*/ if (lwb->lwb_error != 0) return (NULL); /* * Log blocks are pre-allocated. Here we select the size of the next * block, based on size used in the last block. * - first find the smallest bucket that will fit the block from a * limited set of block sizes. This is because it's faster to write * blocks allocated from the same metaslab as they are adjacent or * close. * - next find the maximum from the new suggested size and an array of * previous sizes. This lessens a picket fence effect of wrongly * guessing the size if we have a stream of say 2k, 64k, 2k, 64k * requests. * * Note we only write what is used, but we can't just allocate * the maximum block size because we can exhaust the available * pool log space. */ uint64_t zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) continue; zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, zil_blksz, uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); return (zil_alloc_lwb(zilog, zil_blksz, NULL, 0, 0, state)); } /* * Finalize previously closed block and issue the write zio. */ static void zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) { spa_t *spa = zilog->zl_spa; zil_chain_t *zilc; boolean_t slog; zbookmark_phys_t zb; zio_priority_t prio; int error; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); /* Actually fill the lwb with the data. */ for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; itx = list_next(&lwb->lwb_itxs, itx)) zil_lwb_commit(zilog, lwb, itx); lwb->lwb_nused = lwb->lwb_nfilled; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); /* * The lwb is now ready to be issued, but it can be only if it already * got its block pointer allocated or the allocation has failed. * Otherwise leave it as-is, relying on some other thread to issue it * after allocating its block pointer via calling zil_lwb_write_issue() * for the previous lwb(s) in the chain. */ mutex_enter(&zilog->zl_lock); lwb->lwb_state = LWB_STATE_READY; if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) { mutex_exit(&zilog->zl_lock); return; } mutex_exit(&zilog->zl_lock); next_lwb: if (lwb->lwb_slim) zilc = (zil_chain_t *)lwb->lwb_buf; else zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax); int wsz = lwb->lwb_sz; if (lwb->lwb_error == 0) { abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz); if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) prio = ZIO_PRIORITY_SYNC_WRITE; else prio = ZIO_PRIORITY_ASYNC_WRITE; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0, &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done, lwb, prio, ZIO_FLAG_CANFAIL, &zb); zil_lwb_add_block(lwb, &lwb->lwb_blk); if (lwb->lwb_slim) { /* For Slim ZIL only write what is used. 
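The size guess in zil_lwb_write_close() above has two parts: round the current usage up into a small set of bucket sizes, then take the maximum over the last few guesses so an alternating 2k/64k workload does not flip-flop between block sizes. A stand-alone sketch of the same selection, assuming a history depth of 16 (the rotor mask above implies a power of two; the exact ZIL_PREV_BLKS value is an assumption here) and capping the table at 128 KiB for illustration:

    #include <stdint.h>
    #include <stdio.h>

    #define PREV_BLKS   16          /* assumed history depth, must be 2^n */

    static const struct {
        uint64_t limit;
        uint64_t blksz;
    } buckets[] = {
        { 4096,          4096 },
        { 8192 + 4096,   8192 + 4096 },
        { 32768 + 4096,  32768 + 4096 },
        { 65536 + 4096,  65536 + 4096 },
        { 131072,        131072 },
        { 131072 + 4096, 65536 + 4096 },
        { UINT64_MAX,    131072 },  /* illustrative cap */
    };

    static uint64_t prev[PREV_BLKS];
    static unsigned rotor;

    static uint64_t
    pick_block_size(uint64_t used)
    {
        uint64_t blksz;
        int i;

        for (i = 0; used > buckets[i].limit; i++)
            continue;
        blksz = buckets[i].blksz;

        /* Smooth over recent history to avoid the picket-fence effect. */
        prev[rotor] = blksz;
        for (i = 0; i < PREV_BLKS; i++)
            if (prev[i] > blksz)
                blksz = prev[i];
        rotor = (rotor + 1) & (PREV_BLKS - 1);
        return (blksz);
    }

    int
    main(void)
    {
        /* Alternating small/large bursts settle on the larger size. */
        uint64_t sizes[] = { 2048, 65536, 2048, 65536, 2048 };

        for (unsigned i = 0; i < sizeof (sizes) / sizeof (sizes[0]); i++)
            printf("%llu -> %llu\n", (unsigned long long)sizes[i],
                (unsigned long long)pick_block_size(sizes[i]));
        return (0);
    }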
*/ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; } memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; } else { /* * We can't write the lwb if there was an allocation failure, * so create a null zio instead just to maintain dependencies. */ lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL, zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL); lwb->lwb_write_zio->io_error = lwb->lwb_error; } if (lwb->lwb_child_zio) zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio); /* * Open transaction to allocate the next block pointer. */ dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); uint64_t txg = dmu_tx_get_txg(tx); /* * Allocate next the block pointer unless we are already in error. */ lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb); blkptr_t *bp = &zilc->zc_next_blk; BP_ZERO(bp); error = lwb->lwb_error; if (error == 0) { error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz, &slog); } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; } /* * Reduce TXG open time by incrementing inflight counter and committing * the transaciton. zil_sync() will wait for it to return to zero. */ mutex_enter(&zilog->zl_lwb_io_lock); lwb->lwb_issued_txg = txg; zilog->zl_lwb_inflight[txg & TXG_MASK]++; zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg); mutex_exit(&zilog->zl_lwb_io_lock); dmu_tx_commit(tx); spa_config_enter(spa, SCL_STATE, lwb, RW_READER); /* * We've completed all potentially blocking operations. Update the * nlwb and allow it proceed without possible lock order reversals. */ mutex_enter(&zilog->zl_lock); zil_lwb_set_zio_dependency(zilog, lwb); lwb->lwb_state = LWB_STATE_ISSUED; if (nlwb) { nlwb->lwb_blk = *bp; nlwb->lwb_error = error; nlwb->lwb_slog = slog; nlwb->lwb_alloc_txg = txg; if (nlwb->lwb_state != LWB_STATE_READY) nlwb = NULL; } mutex_exit(&zilog->zl_lock); if (lwb->lwb_slog) { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } else { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, lwb->lwb_nused); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, wsz); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, BP_GET_LSIZE(&lwb->lwb_blk)); } lwb->lwb_issued_timestamp = gethrtime(); if (lwb->lwb_child_zio) zio_nowait(lwb->lwb_child_zio); zio_nowait(lwb->lwb_write_zio); zio_nowait(lwb->lwb_root_zio); /* * If nlwb was ready when we gave it the block pointer, * it is on us to issue it and possibly following ones. */ lwb = nlwb; if (lwb) goto next_lwb; } /* * Maximum amount of data that can be put into single log block. */ uint64_t zil_max_log_data(zilog_t *zilog, size_t hdrsize) { return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize); } /* * Maximum amount of log space we agree to waste to reduce number of * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~6%). 
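Earlier in zil_lwb_write_issue() above, the Slim-ZIL path rounds lwb_nused up with P2ROUNDUP_TYPED() to a multiple of ZIL_MIN_BLKSZ before shrinking the write zio. The rounding itself is the usual power-of-two trick; a minimal version, assuming the alignment is a power of two (as ZIL_MIN_BLKSZ is):

    #include <assert.h>
    #include <stdint.h>

    /* Round x up to the next multiple of align; align must be a power of two. */
    static uint64_t
    p2roundup(uint64_t x, uint64_t align)
    {
        return ((x + align - 1) & ~(align - 1));
    }

    int
    main(void)
    {
        assert(p2roundup(0, 4096) == 0);
        assert(p2roundup(1, 4096) == 4096);
        assert(p2roundup(4096, 4096) == 4096);
        assert(p2roundup(5000, 4096) == 8192);
        return (0);
    }

The same helper shape is what keeps log records 8-byte aligned elsewhere in this file, just with sizeof (uint64_t) as the alignment.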
*/ static inline uint64_t zil_max_waste_space(zilog_t *zilog) { return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 16); } /* * Maximum amount of write data for WR_COPIED. For correctness, consumers * must fall back to WR_NEED_COPY if we can't fit the entire record into one * maximum sized log block, because each WR_COPIED record must fit in a * single log block. Below that it is a tradeoff of additional memory copy * and possibly worse log space efficiency vs additional range lock/unlock. */ static uint_t zil_maxcopied = 7680; uint64_t zil_max_copied_data(zilog_t *zilog) { uint64_t max_data = zil_max_log_data(zilog, sizeof (lr_write_t)); return (MIN(max_data, zil_maxcopied)); } /* * Estimate space needed in the lwb for the itx. Allocate more lwbs or * split the itx as needed, but don't touch the actual transaction data. * Has to be called under zl_issuer_lock to call zil_lwb_write_close() * to chain more lwbs. */ static lwb_t * zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { itx_t *citx; lr_t *lr, *clr; lr_write_t *lrw; uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); ASSERT3P(lwb->lwb_buf, !=, NULL); zil_lwb_write_open(zilog, lwb); lr = &itx->itx_lr; lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead * it's simply used as a place holder on the commit list, and * provides a mechanism for attaching a "commit waiter" onto the * correct lwb (such that the waiter can be signalled upon * completion of that lwb). Thus, we don't process this itx's * log record if it's a commit itx (these itx's don't have log * records), and instead link the itx's waiter onto the lwb's * list of waiters. * * For more details, see the comment above zil_commit(). */ if (lr->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_lwb(itx->itx_private, lwb); list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } + reclen = lr->lrc_reclen; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(reclen, ==, sizeof (lr_write_t)); dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { + ASSERT3U(reclen, >=, sizeof (lr_t)); dlen = 0; } - reclen = lr->lrc_reclen; + ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); zilog->zl_cur_used += (reclen + dlen); cont: /* * If this record won't fit in the current log block, start a new one. * For WR_NEED_COPY optimize layout for minimal number of chunks. */ lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t)); if (reclen > lwb_sp || (reclen + dlen > lwb_sp && lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED); if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } + /* + * There must be enough space in the log block to hold reclen. 
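The branch test in zil_lwb_assign() above decides when to close the current block: always when even the record header will not fit, and also when the whole record will not fit but the leftover space is below the allowed "waste" (1/16 of a block's usable space, per zil_max_waste_space() above), so abandoning it costs little. A simplified stand-alone version of that predicate; the capacity constants below are assumptions, only the shape of the condition mirrors the code:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_LOG_DATA    (131072 - 192)  /* assumed usable bytes per block */
    #define MAX_WASTE       (MAX_LOG_DATA / 16)

    /*
     * Return true if a record of reclen header bytes plus dlen data bytes
     * should go into a fresh block instead of the current one, which has
     * lwb_sp bytes left.
     */
    static bool
    needs_new_block(uint64_t reclen, uint64_t dlen, uint64_t lwb_sp)
    {
        if (reclen > lwb_sp)
            return (true);
        return (reclen + dlen > lwb_sp &&
            lwb_sp < MAX_WASTE &&
            (dlen % MAX_LOG_DATA == 0 ||
            lwb_sp < reclen + dlen % MAX_LOG_DATA));
    }

    int
    main(void)
    {
        /* Small record, plenty of room: keep filling the current block. */
        printf("%d\n", needs_new_block(192, 4096, 65536));
        /* Big record, tiny tail left: cheaper to start a new block. */
        printf("%d\n", needs_new_block(192, 65536, 2048));
        return (0);
    }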
+ * For WR_COPIED, we need to fit the whole record in one block, + * and reclen is the write record header size + the data size. + * For WR_NEED_COPY, we can create multiple records, splitting + * the data into multiple blocks, so we only need to fit one + * word of data per block; in this case reclen is just the header + * size (no data). + */ + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); citx = zil_itx_clone(itx); clr = &citx->itx_lr; lr_write_t *clrw = (lr_write_t *)clr; clrw->lr_length = dnow; lrw->lr_offset += dnow; lrw->lr_length -= dnow; } else { citx = itx; clr = lr; } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ clr->lrc_seq = ++zilog->zl_lr_seq; lwb->lwb_nused += reclen + dnow; ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); zil_lwb_add_txg(lwb, lr->lrc_txg); list_insert_tail(&lwb->lwb_itxs, citx); dlen -= dnow; if (dlen > 0) { zilog->zl_cur_used += reclen; goto cont; } if (lr->lrc_txtype == TX_WRITE && lr->lrc_txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); return (lwb); } /* * Fill the actual transaction data into the lwb, following zil_lwb_assign(). * Does not require locking. */ static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) { lr_t *lr, *lrb; lr_write_t *lrw, *lrwb; char *lr_buf; uint64_t dlen, reclen; lr = &itx->itx_lr; lrw = (lr_write_t *)lr; if (lr->lrc_txtype == TX_COMMIT) return; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { dlen = 0; } reclen = lr->lrc_reclen; ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; memcpy(lr_buf, lr, reclen); lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, lrw->lr_length); } else { char *dbuf; int error; if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; ZIL_STAT_BUMP(zilog, zil_itx_indirect_count); ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes, lrw->lr_length); if (lwb->lwb_child_zio == NULL) { lwb->lwb_child_zio = zio_null(NULL, zilog->zl_spa, NULL, NULL, NULL, ZIO_FLAG_CANFAIL); } } /* * The "lwb_child_zio" we pass in will become a child of * "lwb_write_zio", when one is created, so one will be * a parent of any zio's created by the "zl_get_data". * This way "lwb_write_zio" will first wait for children * block pointers before own writing, and then for their * writing completion before the vdev cache flushing. */ error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_child_zio); if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. 
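When the data still does not fit, the loop above clones the itx and carves off dnow = MIN(dlen, space - reclen) bytes per block, advancing lr_offset and shrinking lr_length on the remainder. The chunking arithmetic in isolation, with a fixed per-block free space for simplicity (the real loop recomputes lwb_sp for each fresh block, and 192 is only an assumed write-record header size):

    #include <stdint.h>
    #include <stdio.h>

    #define RECLEN  192     /* assumed write-record header size */

    /* Print how a dlen-byte write is split across blocks with 'space' free. */
    static void
    split_write(uint64_t offset, uint64_t dlen, uint64_t space_per_block)
    {
        while (dlen > 0) {
            uint64_t dnow = dlen;

            if (dnow > space_per_block - RECLEN)
                dnow = space_per_block - RECLEN;
            printf("chunk: offset %llu length %llu\n",
                (unsigned long long)offset, (unsigned long long)dnow);
            offset += dnow;
            dlen -= dnow;
        }
    }

    int
    main(void)
    {
        /* A 300 KiB write into blocks with ~128 KiB of free space each. */
        split_write(0, 300 * 1024, 131072);
        return (0);
    }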
*/ memset((char *)dbuf + lrwb->lr_length, 0, dlen - lrwb->lr_length); } /* * Typically, the only return values we should see from * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or * EALREADY. However, it is also possible to see other * error values such as ENOSPC or EINVAL from * dmu_read() -> dnode_hold() -> dnode_hold_impl() or * ENXIO as well as a multitude of others from the * block layer through dmu_buf_hold() -> dbuf_read() * -> zio_wait(), as well as through dmu_read() -> * dnode_hold() -> dnode_hold_impl() -> dbuf_read() -> * zio_wait(). When these errors happen, we can assume * that neither an immediate write nor an indirect * write occurred, so we need to fall back to * txg_wait_synced(). This is unusual, so we print to * dmesg whenever one of these errors occurs. */ switch (error) { case 0: break; default: cmn_err(CE_WARN, "zil_lwb_commit() received " "unexpected error %d from ->zl_get_data()" ". Falling back to txg_wait_synced().", error); zfs_fallthrough; case EIO: txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: return; } } } lwb->lwb_nfilled += reclen + dlen; ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * zil_itx_create(uint64_t txtype, size_t olrsize) { size_t itxsize, lrsize; itx_t *itx; + ASSERT3U(olrsize, >=, sizeof (lr_t)); lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); + ASSERT3U(lrsize, >=, olrsize); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_lr.lrc_seq = 0; /* defensive */ memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize); itx->itx_sync = B_TRUE; /* default is synchronous */ itx->itx_callback = NULL; itx->itx_callback_data = NULL; itx->itx_size = itxsize; return (itx); } static itx_t * zil_itx_clone(itx_t *oitx) { + ASSERT3U(oitx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(oitx->itx_size, ==, + offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen); + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; itx->itx_callback_data = NULL; return (itx); } void zil_itx_destroy(itx_t *itx) { + ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(itx->itx_lr.lrc_reclen, ==, + itx->itx_size - offsetof(itx_t, itx_lr)); IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); if (itx->itx_callback != NULL) itx->itx_callback(itx->itx_callback_data); zio_data_buf_free(itx, itx->itx_size); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(void *arg) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itxs_t *itxs = arg; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * * - a thread calls zil_commit() which assigns the * commit itx to a per-txg i_sync_list * - zil_itxg_clean() is called (e.g. via spa_sync()) * while the waiter is still on the i_sync_list * * There's nothing to prevent syncing the txg while the * waiter is on the i_sync_list. 
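zil_itx_create() and zil_itx_destroy() above maintain the invariant that itx_size is exactly the itx header plus the 8-byte-rounded record length, and the new assertions check that relation in both directions. A userland sketch of the same sizing with a deliberately simplified itx layout (all names and the record contents here are illustrative):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdlib.h>

    typedef struct {
        uint64_t lrc_reclen;
        uint64_t lrc_txtype;
    } fake_lr_t;

    typedef struct {
        size_t    itx_size;
        fake_lr_t itx_lr;           /* record starts here, may run longer */
    } fake_itx_t;

    static fake_itx_t *
    fake_itx_create(uint64_t txtype, size_t olrsize)
    {
        /* Round the record up to 8 bytes, as zil_itx_create() does. */
        size_t lrsize = (olrsize + 7) & ~(size_t)7;
        size_t itxsize = offsetof(fake_itx_t, itx_lr) + lrsize;
        fake_itx_t *itx = calloc(1, itxsize);   /* padding starts zeroed */

        itx->itx_size = itxsize;
        itx->itx_lr.lrc_reclen = lrsize;
        itx->itx_lr.lrc_txtype = txtype;
        return (itx);
    }

    static void
    fake_itx_destroy(fake_itx_t *itx)
    {
        /* The invariant the new ASSERTs above encode. */
        assert(itx->itx_lr.lrc_reclen ==
            itx->itx_size - offsetof(fake_itx_t, itx_lr));
        free(itx);
    }

    int
    main(void)
    {
        fake_itx_t *itx = fake_itx_create(1, 100);

        assert(itx->itx_lr.lrc_reclen == 104);  /* 100 rounded up to 8 */
        fake_itx_destroy(itx);
        return (0);
    }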
This normally doesn't * happen because spa_sync() is slower than zil_commit(), * but if zil_commit() calls txg_wait_synced() (e.g. * because zil_create() or zil_commit_writer_stall() is * called) we will hit this case. */ if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); zil_itx_destroy(itx); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; return (TREE_CMP(o1, o2)); } /* * Remove all async itx with the given oid. */ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); + ian_search.ia_foid = oid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. 
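The ian_search change in zil_remove_async() above fixes the key passed to avl_find(): the comparator dereferences its arguments as itx_async_node_t, so the lookup must hand it a node-typed search key on the stack rather than a pointer to the bare oid. A small sketch of the same pattern using bsearch(), with padding in front of the key field to show why an untyped pointer would not line up (layout is illustrative, not the real itx_async_node_t):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
        char     ia_pad[16];    /* fields before the key; &oid would not line up */
        uint64_t ia_foid;
    } node_t;

    /* Comparator in the style of zil_aitx_compare(): casts to the node type. */
    static int
    node_cmp(const void *x1, const void *x2)
    {
        uint64_t o1 = ((const node_t *)x1)->ia_foid;
        uint64_t o2 = ((const node_t *)x2)->ia_foid;

        return ((o1 > o2) - (o1 < o2));
    }

    int
    main(void)
    {
        node_t nodes[3] = {
            { .ia_foid = 10 }, { .ia_foid = 20 }, { .ia_foid = 30 }
        };
        node_t search = { .ia_foid = 20 };      /* typed search key on stack */
        node_t *found;

        found = bsearch(&search, nodes, 3, sizeof (node_t), node_cmp);
        printf("found foid %llu\n",
            found ? (unsigned long long)found->ia_foid : 0ULL);
        return (0);
    }

The (o1 > o2) - (o1 < o2) expression is the portable three-way compare that TREE_CMP provides in the kernel code.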
*/ zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " "txg %llu", (u_longlong_t)itxg->itxg_txg); clean = itxg->itxg_itxs; } itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); /* * We don't want to dirty the ZIL using ZILTEST_TXG, because * zil_clean() will never be called using ZILTEST_TXG. Thus, we * need to be careful to always dirty the ZIL using the "real" * TXG (not itxg_txg) even when the SPA is frozen. */ zilog_dirty(zilog, dmu_tx_get_txg(tx)); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been committed) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). */ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; ASSERT3U(synced_txg, <, ZILTEST_TXG); mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT3U(itxg->itxg_txg, !=, 0); clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ ASSERT3P(zilog->zl_dmu_pool, !=, NULL); ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL); taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq, zil_itxg_clean, clean_me, TQ_NOSLEEP); if (id == TASKQID_INVALID) zil_itxg_clean(clean_me); } /* * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ static uint64_t zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. That's okay since we'll * only commit things in the future. 
*/ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If we're adding itx records to the zl_itx_commit_list, * then the zil better be dirty in this "txg". We can assert * that here since we're holding the itxg_lock which will * prevent spa_sync from cleaning it. Once we add the itxs * to the zl_itx_commit_list we must commit it to disk even * if it's unnecessary (i.e. the txg was synced). */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); list_t *sync_list = &itxg->itxg_itxs->i_sync_list; if (unlikely(zilog->zl_suspend > 0)) { /* * ZIL was just suspended, but we lost the race. * Allow all earlier itxs to be committed, but ask * caller to do txg_wait_synced(txg) for any new. */ if (!list_is_empty(sync_list)) wtxg = MAX(wtxg, txg); } else { list_move_tail(commit_list, sync_list); } mutex_exit(&itxg->itxg_lock); } return (wtxg); } /* * Move the async itxs for a specified object to commit into sync lists. */ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; /* * This is inherently racy, since there is nothing to prevent * the last synced txg from changing. */ for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { - ian = avl_find(t, &foid, &where); + ian_search.ia_foid = foid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } /* * This function will prune commit itxs that are at the head of the * commit list (it won't prune past the first non-commit itx), and * either: a) attach them to the last lwb that's still pending * completion, or b) skip them altogether. * * This is used as a performance optimization to prevent commit itxs * from generating new lwbs when it's unnecessary to do so. */ static void zil_prune_commit_list(zilog_t *zilog) { itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; if (lrc->lrc_txtype != TX_COMMIT) break; mutex_enter(&zilog->zl_lock); lwb_t *last_lwb = zilog->zl_last_lwb_opened; if (last_lwb == NULL || last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) { /* * All of the itxs this waiter was waiting on * must have already completed (or there were * never any itx's for it to wait on), so it's * safe to skip this waiter and mark it done. 
*/ zil_commit_waiter_skip(itx->itx_private); } else { zil_commit_waiter_link_lwb(itx->itx_private, last_lwb); } mutex_exit(&zilog->zl_lock); list_remove(&zilog->zl_itx_commit_list, itx); zil_itx_destroy(itx); } IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); } static void zil_commit_writer_stall(zilog_t *zilog) { /* * When zio_alloc_zil() fails to allocate the next lwb block on * disk, we must call txg_wait_synced() to ensure all of the * lwbs in the zilog's zl_lwb_list are synced and then freed (in * zil_sync()), such that any subsequent ZIL writer (i.e. a call * to zil_process_commit_list()) will have to call zil_create(), * and start a new ZIL chain. * * Since zil_alloc_zil() failed, the lwb that was previously * issued does not have a pointer to the "next" lwb on disk. * Thus, if another ZIL writer thread was to allocate the "next" * on-disk lwb, that block could be leaked in the event of a * crash (because the previous lwb on-disk would not point to * it). * * We must hold the zilog's zl_issuer_lock while we do this, to * ensure no new threads enter zil_process_commit_list() until * all lwb's in the zl_lwb_list have been synced and freed * (which is achieved via the txg_wait_synced() call). */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); ASSERT(list_is_empty(&zilog->zl_lwb_list)); } static void zil_burst_done(zilog_t *zilog) { if (!list_is_empty(&zilog->zl_itx_commit_list) || zilog->zl_cur_used == 0) return; if (zilog->zl_parallel) zilog->zl_parallel--; zilog->zl_cur_used = 0; } /* * This function will traverse the commit list, creating new lwbs as * needed, and committing the itxs from the commit list to these newly * created lwbs. Additionally, as a new lwb is created, the previous * lwb will be issued to the zio layer to be written to disk. */ static void zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; list_t nolwb_waiters; lwb_t *lwb, *plwb; itx_t *itx; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). */ if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) { lwb = zil_create(zilog); } else { /* * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will * have already been created (zl_lwb_list not empty). */ zil_commit_activate_saxattr_feature(zilog); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * If the lwb is still opened, it means the workload is really * multi-threaded and we won the chance of write aggregation. * If it is not opened yet, but previous lwb is still not * flushed, it still means the workload is multi-threaded, but * there was too much time between the commits to aggregate, so * we try aggregation next times, but without too much hopes. 
*/ if (lwb->lwb_state == LWB_STATE_OPENED) { zilog->zl_parallel = ZIL_BURSTS; } else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb)) != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) { zilog->zl_parallel = MAX(zilog->zl_parallel, ZIL_BURSTS / 2); } } while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; ASSERT3U(txg, !=, 0); if (lrc->lrc_txtype == TX_COMMIT) { DTRACE_PROBE2(zil__process__commit__itx, zilog_t *, zilog, itx_t *, itx); } else { DTRACE_PROBE2(zil__process__normal__itx, zilog_t *, zilog, itx_t *, itx); } boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); /* * If the txg of this itx has already been synced out, then * we don't need to commit this itx to an lwb. This is * because the data of this itx will have already been * written to the main pool. This is inherently racy, and * it's still ok to commit an itx whose txg has already * been synced; this will result in a write that's * unnecessary, but will do no harm. * * With that said, we always want to commit TX_COMMIT itxs * to an lwb, regardless of whether or not that itx's txg * has been synced out. We do this to ensure any OPENED lwb * will always have at least one zil_commit_waiter_t linked * to the lwb. * * As a counter-example, if we skipped TX_COMMIT itx's * whose txg had already been synced, the following * situation could occur if we happened to be racing with * spa_sync: * * 1. We commit a non-TX_COMMIT itx to an lwb, where the * itx's txg is 10 and the last synced txg is 9. * 2. spa_sync finishes syncing out txg 10. * 3. We move to the next itx in the list, it's a TX_COMMIT * whose txg is 10, so we skip it rather than committing * it to the lwb used in (1). * * If the itx that is skipped in (3) is the last TX_COMMIT * itx in the commit list, than it's possible for the lwb * used in (1) to remain in the OPENED state indefinitely. * * To prevent the above scenario from occurring, ensuring * that once an lwb is OPENED it will transition to ISSUED * and eventually DONE, we always commit TX_COMMIT itx's to * an lwb here, even if that itx's txg has already been * synced. * * Finally, if the pool is frozen, we _always_ commit the * itx. The point of freezing the pool is to prevent data * from being written to the main pool via spa_sync, and * instead rely solely on the ZIL to persistently store the * data; i.e. when the pool is frozen, the last synced txg * value can't be trusted. */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); } else if ((zcw->zcw_lwb != NULL && zcw->zcw_lwb != lwb) || zcw->zcw_done) { /* * Our lwb is done, leave the rest of * itx list to somebody else who care. */ zilog->zl_parallel = ZIL_BURSTS; break; } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } list_insert_tail(&nolwb_itxs, itx); } } else { ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); } } if (lwb == NULL) { /* * This indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this happens, we must stall * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. 
*/ while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* * Additionally, we have to signal and mark the "nolwb" * waiters as "done" here, since without an lwb, we * can't do this via zil_lwb_flush_vdevs_done() like * normal. */ zil_commit_waiter_t *zcw; while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); ASSERT(lwb->lwb_state == LWB_STATE_NEW || lwb->lwb_state == LWB_STATE_OPENED); /* * At this point, the ZIL block pointed at by the "lwb" * variable is in "new" or "opened" state. * * If it's "new", then no itxs have been committed to it, so * there's no point in issuing its zio (i.e. it's "empty"). * * If it's "opened", then it contains one or more itxs that * eventually need to be committed to stable storage. In * this case we intentionally do not issue the lwb's zio * to disk yet, and instead rely on one of the following * two mechanisms for issuing the zio: * * 1. Ideally, there will be more ZIL activity occurring on * the system, such that this function will be immediately * called again by different thread and this lwb will be * closed by zil_lwb_assign(). This way, the lwb will be * "full" when it is issued to disk, and we'll make use of * the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, zil_commit_waiter() will close it and issue * the zio. If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount * of ZIL activity occurring on the system at that time. * * We do this for a couple of reasons: * * 1. To try and reduce the number of IOPs needed to * write the same number of itxs. If an lwb has space * available in its buffer for more itxs, and more itxs * will be committed relatively soon (relative to the * latency of performing a write), then it's beneficial * to wait for these "next" itxs. This way, more itxs * can be committed to stable storage with fewer writes. * * 2. To try and use the largest lwb block size that the * incoming rate of itxs can support. Again, this is to * try and pack as many itxs into as few lwbs as * possible, without significantly impacting the latency * of each individual itx. */ if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) { list_insert_tail(ilwbs, lwb); lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); zil_burst_done(zilog); if (lwb == NULL) { while ((lwb = list_remove_head(ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); } } } } /* * This function is responsible for ensuring the passed in commit waiter * (and associated commit itx) is committed to an lwb. If the waiter is * not already committed to an lwb, all itxs in the zilog's queue of * itxs will be processed. The assumption is the passed in waiter's * commit itx will found in the queue just like the other non-commit * itxs, such that when the entire queue is processed, the waiter will * have been committed to an lwb. * * The lwb associated with the passed in waiter is not guaranteed to * have been issued by the time this function completes. 
If the lwb is * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). */ static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { /* * It's possible that, while we were waiting to acquire * the "zl_issuer_lock", another thread committed this * waiter to an lwb. If that occurs, we bail out early, * without processing any of the zilog's queue of itxs. * * On certain workloads and system configurations, the * "zl_issuer_lock" can become highly contended. In an * attempt to reduce this contention, we immediately drop * the lock if the waiter has already been processed. * * We've measured this optimization to reduce CPU spent * contending on this lock by up to 5%, using a system * with 32 CPUs, low latency storage (~50 usec writes), * and 1024 threads performing sync writes. */ goto out; } ZIL_STAT_BUMP(zilog, zil_commit_writer_count); wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); return (wtxg); } static void zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); ASSERT3B(zcw->zcw_done, ==, B_FALSE); lwb_t *lwb = zcw->zcw_lwb; ASSERT3P(lwb, !=, NULL); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW); /* * If the lwb has already been issued by another thread, we can * immediately return since there's no work to be done (the * point of this function is to issue the lwb). Additionally, we * do this prior to acquiring the zl_issuer_lock, to avoid * acquiring it when it's not necessary to do so. */ if (lwb->lwb_state != LWB_STATE_OPENED) return; /* * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order * elsewhere. */ mutex_exit(&zcw->zcw_lock); mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zcw->zcw_lock); /* * Since we just dropped and re-acquired the commit waiter's * lock, we have to re-check to see if the waiter was marked * "done" during that process. If the waiter was marked "done", * the "lwb" pointer is no longer valid (it can be free'd after * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ if (zcw->zcw_done) { mutex_exit(&zilog->zl_issuer_lock); return; } ASSERT3P(lwb, ==, zcw->zcw_lwb); /* * We've already checked this above, but since we hadn't acquired * the zilog's zl_issuer_lock, we have to perform this check a * second time while holding the lock. * * We don't need to hold the zl_lock since the lwb cannot transition * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb * _can_ transition from CLOSED to DONE, but it's OK to race with * that transition since we treat the lwb the same, whether it's in * the CLOSED, ISSUED or DONE states. 
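 *
 * (For reference, and as an informal sketch only: the lwb states
 * referenced here progress roughly as NEW -> OPENED -> CLOSED ->
 * READY -> ISSUED -> WRITE_DONE -> FLUSH_DONE; the comment above the
 * lwb_state_t definition, referenced below, is authoritative.)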
* * The important thing, is we treat the lwb differently depending on * if it's OPENED or CLOSED, and block any other threads that might * attempt to close/issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call * zil_lwb_write_close() if the lwb had already been closed/issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state != LWB_STATE_OPENED) { mutex_exit(&zilog->zl_issuer_lock); return; } /* * We do not need zcw_lock once we hold zl_issuer_lock and know lwb * is still open. But we have to drop it to avoid a deadlock in case * callback of zio issued by zil_lwb_write_issue() try to get it, * while zil_lwb_write_issue() is blocked on attempt to issue next * lwb it found in LWB_STATE_READY state. */ mutex_exit(&zcw->zcw_lock); /* * As described in the comments above zil_commit_waiter() and * zil_process_commit_list(), we need to issue this lwb's zio * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED); zil_burst_done(zilog); if (nlwb == NULL) { /* * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. When this occurs, the ZIL write * pipeline must be stalled; see the comment within the * zil_commit_writer_stall() function for more details. */ zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); mutex_exit(&zilog->zl_issuer_lock); } else { mutex_exit(&zilog->zl_issuer_lock); zil_lwb_write_issue(zilog, lwb); } mutex_enter(&zcw->zcw_lock); } /* * This function is responsible for performing the following two tasks: * * 1. its primary responsibility is to block until the given "commit * waiter" is considered "done". * * 2. its secondary responsibility is to issue the zio for the lwb that * the given "commit waiter" is waiting on, if this function has * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. * * For more details, see zil_process_commit_list(); more specifically, * the comment at the bottom of that function. */ static void zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw) { ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT(spa_writeable(zilog->zl_spa)); mutex_enter(&zcw->zcw_lock); /* * The timeout is scaled based on the lwb latency to avoid * significantly impacting the latency of each individual itx. * For more details, see the comment at the bottom of the * zil_process_commit_list() function. */ int pct = MAX(zfs_commit_timeout_pct, 1); hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100; hrtime_t wakeup = gethrtime() + sleep; boolean_t timedout = B_FALSE; while (!zcw->zcw_done) { ASSERT(MUTEX_HELD(&zcw->zcw_lock)); lwb_t *lwb = zcw->zcw_lwb; /* * Usually, the waiter will have a non-NULL lwb field here, * but it's possible for it to be NULL as a result of * zil_commit() racing with spa_sync(). * * When zil_clean() is called, it's possible for the itxg * list (which may be cleaned via a taskq) to contain * commit itxs. 
When this occurs, the commit waiters linked * off of these commit itxs will not be committed to an * lwb. Additionally, these commit waiters will not be * marked done until zil_commit_waiter_skip() is called via * zil_itxg_clean(). * * Thus, it's possible for this commit waiter (i.e. the * "zcw" variable) to be found in this "in between" state; * where it's "zcw_lwb" field is NULL, and it hasn't yet * been skipped, so it's "zcw_done" field is still B_FALSE. */ IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW); if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) { ASSERT3B(timedout, ==, B_FALSE); /* * If the lwb hasn't been issued yet, then we * need to wait with a timeout, in case this * function needs to issue the lwb after the * timeout is reached; responsibility (2) from * the comment above this function. */ int rc = cv_timedwait_hires(&zcw->zcw_cv, &zcw->zcw_lock, wakeup, USEC2NSEC(1), CALLOUT_FLAG_ABSOLUTE); if (rc != -1 || zcw->zcw_done) continue; timedout = B_TRUE; zil_commit_waiter_timeout(zilog, zcw); if (!zcw->zcw_done) { /* * If the commit waiter has already been * marked "done", it's possible for the * waiter's lwb structure to have already * been freed. Thus, we can only reliably * make these assertions if the waiter * isn't done. */ ASSERT3P(lwb, ==, zcw->zcw_lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); } } else { /* * If the lwb isn't open, then it must have already * been issued. In that case, there's no need to * use a timeout when waiting for the lwb to * complete. * * Additionally, if the lwb is NULL, the waiter * will soon be signaled and marked done via * zil_clean() and zil_itxg_clean(), so no timeout * is required. */ IMPLY(lwb != NULL, lwb->lwb_state == LWB_STATE_CLOSED || lwb->lwb_state == LWB_STATE_READY || lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || lwb->lwb_state == LWB_STATE_FLUSH_DONE); cv_wait(&zcw->zcw_cv, &zcw->zcw_lock); } } mutex_exit(&zcw->zcw_lock); } static zil_commit_waiter_t * zil_alloc_commit_waiter(void) { zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP); cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL); mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL); list_link_init(&zcw->zcw_node); zcw->zcw_lwb = NULL; zcw->zcw_done = B_FALSE; zcw->zcw_zio_error = 0; return (zcw); } static void zil_free_commit_waiter(zil_commit_waiter_t *zcw) { ASSERT(!list_link_active(&zcw->zcw_node)); ASSERT3P(zcw->zcw_lwb, ==, NULL); ASSERT3B(zcw->zcw_done, ==, B_TRUE); mutex_destroy(&zcw->zcw_lock); cv_destroy(&zcw->zcw_cv); kmem_cache_free(zil_zcw_cache, zcw); } /* * This function is used to create a TX_COMMIT itx and assign it. This * way, it will be linked into the ZIL's list of synchronous itxs, and * then later committed to an lwb (or skipped) when * zil_process_commit_list() is called. */ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); /* * Since we are not going to create any new dirty data, and we * can even help with clearing the existing dirty data, we * should not be subject to the dirty data based delays. We * use TXG_NOTHROTTLE to bypass the delay mechanism. */ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; itx->itx_private = zcw; zil_itx_assign(zilog, itx, tx); dmu_tx_commit(tx); } /* * Commit ZFS Intent Log transactions (itxs) to stable storage. 
* * When writing ZIL transactions to the on-disk representation of the * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple * itxs can be committed to a single lwb. Once a lwb is written and * committed to stable storage (i.e. the lwb is written, and vdevs have * been flushed), each itx that was committed to that lwb is also * considered to be committed to stable storage. * * When an itx is committed to an lwb, the log record (lr_t) contained * by the itx is copied into the lwb's zio buffer, and once this buffer * is written to disk, it becomes an on-disk ZIL block. * * As itxs are generated, they're inserted into the ZIL's queue of * uncommitted itxs. The semantics of zil_commit() are such that it will * block until all itxs that were in the queue when it was called, are * committed to stable storage. * * If "foid" is zero, this means all "synchronous" and "asynchronous" * itxs, for all objects in the dataset, will be committed to stable * storage prior to zil_commit() returning. If "foid" is non-zero, all * "synchronous" itxs for all objects, but only "asynchronous" itxs * that correspond to the foid passed in, will be committed to stable * storage prior to zil_commit() returning. * * Generally speaking, when zil_commit() is called, the consumer doesn't * actually care about _all_ of the uncommitted itxs. Instead, they're * simply trying to waiting for a specific itx to be committed to disk, * but the interface(s) for interacting with the ZIL don't allow such * fine-grained communication. A better interface would allow a consumer * to create and assign an itx, and then pass a reference to this itx to * zil_commit(); such that zil_commit() would return as soon as that * specific itx was committed to disk (instead of waiting for _all_ * itxs to be committed). * * When a thread calls zil_commit() a special "commit itx" will be * generated, along with a corresponding "waiter" for this commit itx. * zil_commit() will wait on this waiter's CV, such that when the waiter * is marked done, and signaled, zil_commit() will return. * * This commit itx is inserted into the queue of uncommitted itxs. This * provides an easy mechanism for determining which itxs were in the * queue prior to zil_commit() having been called, and which itxs were * added after zil_commit() was called. * * The commit itx is special; it doesn't have any on-disk representation. * When a commit itx is "committed" to an lwb, the waiter associated * with it is linked onto the lwb's list of waiters. Then, when that lwb * completes, each waiter on the lwb's list is marked done and signaled * -- allowing the thread waiting on the waiter to return from zil_commit(). * * It's important to point out a few critical factors that allow us * to make use of the commit itxs, commit waiters, per-lwb lists of * commit waiters, and zio completion callbacks like we're doing: * * 1. The list of waiters for each lwb is traversed, and each commit * waiter is marked "done" and signaled, in the zio completion * callback of the lwb's zio[*]. * * * Actually, the waiters are signaled in the zio completion * callback of the root zio for the DKIOCFLUSHWRITECACHE commands * that are sent to the vdevs upon completion of the lwb zio. * * 2. When the itxs are inserted into the ZIL's queue of uncommitted * itxs, the order in which they are inserted is preserved[*]; as * itxs are added to the queue, they are added to the tail of * in-memory linked lists. 
* * When committing the itxs to lwbs (to be written to disk), they * are committed in the same order in which the itxs were added to * the uncommitted queue's linked list(s); i.e. the linked list of * itxs to commit is traversed from head to tail, and each itx is * committed to an lwb in that order. * * * To clarify: * * - the order of "sync" itxs is preserved w.r.t. other * "sync" itxs, regardless of the corresponding objects. * - the order of "async" itxs is preserved w.r.t. other * "async" itxs corresponding to the same object. * - the order of "async" itxs is *not* preserved w.r.t. other * "async" itxs corresponding to different objects. * - the order of "sync" itxs w.r.t. "async" itxs (or vice * versa) is *not* preserved, even for itxs that correspond * to the same object. * * For more details, see: zil_itx_assign(), zil_async_to_sync(), * zil_get_commit_list(), and zil_process_commit_list(). * * 3. The lwbs represent a linked list of blocks on disk. Thus, any * lwb cannot be considered committed to stable storage, until its * "previous" lwb is also committed to stable storage. This fact, * coupled with the fact described above, means that itxs are * committed in (roughly) the order in which they were generated. * This is essential because itxs are dependent on prior itxs. * Thus, we *must not* deem an itx as being committed to stable * storage, until *all* prior itxs have also been committed to * stable storage. * * To enforce this ordering of lwb zio's, while still leveraging as * much of the underlying storage performance as possible, we rely * on two fundamental concepts: * * 1. The creation and issuance of lwb zio's is protected by * the zilog's "zl_issuer_lock", which ensures only a single * thread is creating and/or issuing lwb's at a time * 2. The "previous" lwb is a child of the "current" lwb * (leveraging the zio parent-child dependency graph) * * By relying on this parent-child zio relationship, we can have * many lwb zio's concurrently issued to the underlying storage, * but the order in which they complete will be the same order in * which they were created. */ void zil_commit(zilog_t *zilog, uint64_t foid) { /* * We should never attempt to call zil_commit on a snapshot for * a couple of reasons: * * 1. A snapshot may never be modified, thus it cannot have any * in-flight itxs that would have modified the dataset. * * 2. By design, when zil_commit() is called, a commit itx will * be assigned to this zilog; as a result, the zilog will be * dirtied. We must not dirty the zilog of a snapshot; there's * checks in the code that enforce this invariant, and will * cause a panic if it's not upheld. */ ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE); if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; if (!spa_writeable(zilog->zl_spa)) { /* * If the SPA is not writable, there should never be any * pending itxs waiting to be committed to disk. If that * weren't true, we'd skip writing those itxs out, and * would break the semantics of zil_commit(); thus, we're * verifying that truth before we return to the caller. */ ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); for (int i = 0; i < TXG_SIZE; i++) ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL); return; } /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out * lwbs like would be done in zil_commit_write(). 
Thus, we * simply rely on txg_wait_synced() to maintain the necessary * semantics, and avoid calling those functions altogether. */ if (zilog->zl_suspend > 0) { txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); } void zil_commit_impl(zilog_t *zilog, uint64_t foid) { ZIL_STAT_BUMP(zilog, zil_commit_count); /* * Move the "async" itxs for the specified foid to the "sync" * queues, such that they will be later committed (or skipped) * to an lwb when zil_process_commit_list() is called. * * Since these "async" itxs must be committed prior to this * call to zil_commit returning, we must perform this operation * before we call zil_commit_itx_assign(). */ zil_async_to_sync(zilog, foid); /* * We allocate a new "waiter" structure which will initially be * linked to the commit itx using the itx's "itx_private" field. * Since the commit itx doesn't represent any on-disk state, * when it's committed to an lwb, rather than copying the its * lr_t into the lwb's buffer, the commit itx's "waiter" will be * added to the lwb's list of waiters. Then, when the lwb is * committed to stable storage, each waiter in the lwb's list of * waiters will be marked "done", and signalled. * * We must create the waiter and assign the commit itx prior to * calling zil_commit_writer(), or else our specific commit itx * is not guaranteed to be committed to an lwb prior to calling * zil_commit_waiter(). */ zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { /* * If there was an error writing out the ZIL blocks that * this thread is waiting on, then we fallback to * relying on spa_sync() to write out the data this * thread is waiting on. Obviously this has performance * implications, but the expectation is for this to be * an exceptional case, and shouldn't occur often. */ DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); } else if (wtxg != 0) { txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; zil_lwb_flush_wait_all(zilog, txg); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); ASSERT(list_is_empty(&zilog->zl_lwb_list)); memset(zh, 0, sizeof (zil_header_t)); memset(zilog->zl_replayed_seq, 0, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. 
*/ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } else { /* * A destroyed ZIL chain can't contain any TX_SETSAXATTR * records. So, deactivate the feature for this dataset. * We activate it again when we start a new ZIL chain. */ if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) dsl_dataset_deactivate_feature(ds, SPA_FEATURE_ZILSAXATTR, tx); } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); if (!BP_IS_HOLE(&lwb->lwb_blk)) zio_free(spa, txg, &lwb->lwb_blk); zil_free_lwb(zilog, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } static int zil_lwb_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; lwb_t *lwb = vbuf; list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), offsetof(zil_commit_waiter_t, zcw_node)); avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } static void zil_lwb_dest(void *vbuf, void *unused) { (void) unused; lwb_t *lwb = vbuf; mutex_destroy(&lwb->lwb_vdev_lock); avl_destroy(&lwb->lwb_vdev_tree); list_destroy(&lwb->lwb_waiters); list_destroy(&lwb->lwb_itxs); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); zil_zcw_cache = kmem_cache_create("zil_zcw_cache", sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); zil_sums_init(&zil_sums_global); zil_kstats_global = kstat_create("zfs", 0, "zil", "misc", KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zil_kstats_global != NULL) { zil_kstats_global->ks_data = &zil_stats; zil_kstats_global->ks_update = zil_kstats_global_update; zil_kstats_global->ks_private = NULL; kstat_install(zil_kstats_global); } } void zil_fini(void) { kmem_cache_destroy(zil_zcw_cache); kmem_cache_destroy(zil_lwb_cache); if (zil_kstats_global != NULL) { kstat_delete(zil_kstats_global); zil_kstats_global = NULL; } zil_sums_fini(&zil_sums_global); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_dirty_max_txg = 0; zilog->zl_last_lwb_opened = NULL; zilog->zl_last_lwb_latency = 0; zilog->zl_max_block_size = zil_maxblocksize; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof 
(lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { int i; zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. */ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_issuer_lock); mutex_destroy(&zilog->zl_lock); mutex_destroy(&zilog->zl_lwb_io_lock); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_lwb_io_cv); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums) { zilog_t *zilog = dmu_objset_zil(os); ASSERT3P(zilog->zl_get_data, ==, NULL); ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; zilog->zl_sums = zil_sums; return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg; if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } mutex_enter(&zilog->zl_lock); txg = zilog->zl_dirty_max_txg; lwb = list_tail(&zilog->zl_lwb_list); if (lwb != NULL) { txg = MAX(txg, lwb->lwb_alloc_txg); txg = MAX(txg, lwb->lwb_max_txg); } mutex_exit(&zilog->zl_lock); /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in * zil_lwb_write_issue(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); mutex_exit(&zilog->zl_lwb_io_lock); /* * We need to use txg_wait_synced() to wait until that txg is synced. * zil_sync() will guarantee all lwbs up to that txg have been * written out, flushed, and cleaned. */ if (txg != 0) txg_wait_synced(zilog->zl_dmu_pool, txg); if (zilog_is_dirty(zilog)) zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, (u_longlong_t)txg); if (txg < spa_freeze_txg(zilog->zl_spa)) VERIFY(!zilog_is_dirty(zilog)); zilog->zl_get_data = NULL; /* * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } mutex_exit(&zilog->zl_lock); } static const char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. 
Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). */ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } /* * The ZIL has work to do. Ensure that the associated encryption * key will remain mapped while we are committing the log by * grabbing a reference to it. If the key isn't loaded we have no * choice but to return an error until the wrapping key is loaded. */ if (os->os_encrypted && dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) { zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); return (SET_ERROR(EACCES)); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); /* * We need to use zil_commit_impl to ensure we wait for all * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed * to disk before proceeding. If we used zil_commit instead, it * would just call txg_wait_synced(), because zl_suspend is set. * txg_wait_synced() doesn't wait for these lwb's to be * LWB_STATE_FLUSH_DONE before returning. */ zil_commit_impl(zilog, 0); /* * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we * use txg_wait_synced() to ensure the data from the zilog has * migrated to the main pool before calling zil_destroy(). 
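 *
 * (Aside: from a caller's perspective, the contract described at the
 * top of this function works out to roughly the following sketch,
 * where "osname", "cookie" and "error" are the caller's own:
 *
 *   void *cookie;
 *   error = zil_suspend(osname, &cookie);
 *   if (error == 0) {
 *           ... operate while the ZIL is suspended ...
 *           zil_resume(cookie);
 *   }
 *
 * or pass cookiep == NULL for a one-shot suspend-and-resume.)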
*/ txg_wait_synced(zilog->zl_dmu_pool, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (os->os_encrypted) dsl_dataset_remove_key_mapping(dmu_objset_ds(os)); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t *const *zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, const lr_t *lr, int error) { char name[ZFS_MAX_DATASET_NAME_LEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ memcpy(zr->zr_lr, lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. 
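 *
 * (Note, while reading this function: zr_lr is sized at
 * 2 * SPA_MAXBLOCKSIZE in zil_replay(), which leaves room for both
 * the copied record at the front of the buffer and, for TX_WRITE
 * records, the up-to-one-block of data that zil_read_log_data()
 * appends at zr->zr_lr + reclen.)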
*/ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } static int zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg) { (void) bp, (void) arg, (void) claim_txg; zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. * Return B_TRUE if there were any entries to replay. */ boolean_t zil_replay(objset_t *os, void *arg, zil_replay_func_t *const replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { return (zil_destroy(zilog, B_TRUE)); } zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. */ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg, B_TRUE); vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; return (B_TRUE); } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } int zil_reset(const char *osname, void *arg) { (void) arg; int error = zil_suspend(osname, NULL); /* EACCES means crypto key not loaded */ if ((error == EACCES) || (error == EBUSY)) return (SET_ERROR(error)); if (error != 0) return (SET_ERROR(EEXIST)); return (0); } EXPORT_SYMBOL(zil_alloc); EXPORT_SYMBOL(zil_free); EXPORT_SYMBOL(zil_open); EXPORT_SYMBOL(zil_close); EXPORT_SYMBOL(zil_replay); EXPORT_SYMBOL(zil_replaying); EXPORT_SYMBOL(zil_destroy); EXPORT_SYMBOL(zil_destroy_sync); EXPORT_SYMBOL(zil_itx_create); EXPORT_SYMBOL(zil_itx_destroy); EXPORT_SYMBOL(zil_itx_assign); EXPORT_SYMBOL(zil_commit); EXPORT_SYMBOL(zil_claim); EXPORT_SYMBOL(zil_check_log_chain); EXPORT_SYMBOL(zil_sync); EXPORT_SYMBOL(zil_clean); EXPORT_SYMBOL(zil_suspend); EXPORT_SYMBOL(zil_resume); EXPORT_SYMBOL(zil_lwb_add_block); EXPORT_SYMBOL(zil_bp_tree_add); EXPORT_SYMBOL(zil_set_sync); EXPORT_SYMBOL(zil_set_logbias); EXPORT_SYMBOL(zil_sums_init); EXPORT_SYMBOL(zil_sums_fini); EXPORT_SYMBOL(zil_kstat_values_update); ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW, "ZIL block open timeout percentage"); ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, "Disable intent logging replay"); ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, "Disable ZIL cache flushes"); ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW, "Limit in bytes slog sync writes per commit"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW, "Limit in bytes of ZIL log block size"); ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW, "Limit in bytes WR_COPIED size"); diff --git a/sys/contrib/openzfs/module/zfs/zio_checksum.c b/sys/contrib/openzfs/module/zfs/zio_checksum.c index 9de515e8767a..e511b31fee6d 100644 --- a/sys/contrib/openzfs/module/zfs/zio_checksum.c +++ 
b/sys/contrib/openzfs/module/zfs/zio_checksum.c @@ -1,573 +1,577 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. */ #include #include #include #include #include #include #include #include /* * Checksum vectors. * * In the SPA, everything is checksummed. We support checksum vectors * for three distinct reasons: * * 1. Different kinds of data need different levels of protection. * For SPA metadata, we always want a very strong checksum. * For user data, we let users make the trade-off between speed * and checksum strength. * * 2. Cryptographic hash and MAC algorithms are an area of active research. * It is likely that in future hash functions will be at least as strong * as current best-of-breed, and may be substantially faster as well. * We want the ability to take advantage of these new hashes as soon as * they become available. * * 3. If someone develops hardware that can compute a strong hash quickly, * we want the ability to take advantage of that hardware. * * Of course, we don't want a checksum upgrade to invalidate existing * data, so we store the checksum *function* in eight bits of the bp. * This gives us room for up to 256 different checksum functions. * * When writing a block, we always checksum it with the latest-and-greatest * checksum function of the appropriate strength. When reading a block, * we compare the expected checksum against the actual checksum, which we * compute via the checksum function specified by BP_GET_CHECKSUM(bp). * * SALTED CHECKSUMS * * To enable the use of less secure hash algorithms with dedup, we * introduce the notion of salted checksums (MACs, really). A salted * checksum is fed both a random 256-bit value (the salt) and the data * to be checksummed. This salt is kept secret (stored on the pool, but * never shown to the user). Thus even if an attacker knew of collision * weaknesses in the hash algorithm, they won't be able to mount a known * plaintext attack on the DDT, since the actual hash value cannot be * known ahead of time. How the salt is used is algorithm-specific * (some might simply prefix it to the data block, others might need to * utilize a full-blown HMAC). On disk the salt is stored in a ZAP * object in the MOS (DMU_POOL_CHECKSUM_SALT). * * CONTEXT TEMPLATES * * Some hashing algorithms need to perform a substantial amount of * initialization work (e.g. salted checksums above may need to pre-hash * the salt) before being able to process data. 
Performing this * redundant work for each block would be wasteful, so we instead allow * a checksum algorithm to do the work once (the first time it's used) * and then keep this pre-initialized context as a template inside the * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to * construct and destruct the pre-initialized checksum context. The * pre-initialized context is then reused during each checksum * invocation and passed to the checksum function. */ static void abd_checksum_off(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) abd, (void) size, (void) ctx_template; ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); } static void abd_fletcher_2_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_native, zcp); } static void abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_init(zcp); (void) abd_iterate_func(abd, 0, size, fletcher_2_incremental_byteswap, zcp); } static inline void abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp) { fletcher_4_abd_ops.acf_init(acdp); abd_iterate_func(abd, 0, size, fletcher_4_abd_ops.acf_iter, acdp); fletcher_4_abd_ops.acf_fini(acdp); } void abd_fletcher_4_native(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_4_ctx_t ctx; zio_abd_checksum_data_t acd = { .acd_byteorder = ZIO_CHECKSUM_NATIVE, .acd_zcp = zcp, .acd_ctx = &ctx }; abd_fletcher_4_impl(abd, size, &acd); } void abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, const void *ctx_template, zio_cksum_t *zcp) { (void) ctx_template; fletcher_4_ctx_t ctx; zio_abd_checksum_data_t acd = { .acd_byteorder = ZIO_CHECKSUM_BYTESWAP, .acd_zcp = zcp, .acd_ctx = &ctx }; abd_fletcher_4_impl(abd, size, &acd); } zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { {{NULL, NULL}, NULL, NULL, 0, "inherit"}, {{NULL, NULL}, NULL, NULL, 0, "on"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "off"}, {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, NULL, NULL, 0, "fletcher2"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, {{abd_checksum_sha256, abd_checksum_sha256}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, {{abd_checksum_off, abd_checksum_off}, NULL, NULL, 0, "noparity"}, {{abd_checksum_sha512_native, abd_checksum_sha512_byteswap}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, 
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap}, abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}, }; /* * The flag corresponding to the "verify" in dedup=[checksum,]verify * must be cleared first, so callers should use ZIO_CHECKSUM_MASK. */ spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum) { VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); switch (cksum) { case ZIO_CHECKSUM_BLAKE3: return (SPA_FEATURE_BLAKE3); case ZIO_CHECKSUM_SHA512: return (SPA_FEATURE_SHA512); case ZIO_CHECKSUM_SKEIN: return (SPA_FEATURE_SKEIN); case ZIO_CHECKSUM_EDONR: return (SPA_FEATURE_EDONR); default: return (SPA_FEATURE_NONE); } } enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) { ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); if (child == ZIO_CHECKSUM_INHERIT) return (parent); if (child == ZIO_CHECKSUM_ON) return (ZIO_CHECKSUM_ON_VALUE); return (child); } enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent) { ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); if (child == ZIO_CHECKSUM_INHERIT) return (parent); if (child == ZIO_CHECKSUM_ON) return (spa_dedup_checksum(spa)); if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags & ZCHECKSUM_FLAG_DEDUP) || (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); return (child); } /* * Set the external verifier for a gang block based on , * a tuple which is guaranteed to be unique for the life of the pool. */ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t txg = BP_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); } /* * Set the external verifier for a label block based on its offset. * The vdev is implicit, and the txg is unknowable at pool open time -- * hence the logic in vdev_uberblock_load() to find the most recent copy. */ static void zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) { ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); } /* * Calls the template init function of a checksum which supports context * templates and installs the template into the spa_t. */ static void zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; if (ci->ci_tmpl_init == NULL) return; if (spa->spa_cksum_tmpls[checksum] != NULL) return; VERIFY(ci->ci_tmpl_free != NULL); mutex_enter(&spa->spa_cksum_tmpls_lock); if (spa->spa_cksum_tmpls[checksum] == NULL) { spa->spa_cksum_tmpls[checksum] = ci->ci_tmpl_init(&spa->spa_cksum_salt); VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); } mutex_exit(&spa->spa_cksum_tmpls_lock); } /* convenience function to update a checksum to accommodate an encryption MAC */ static void zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor) { /* * Weak checksums do not have their entropy spread evenly * across the bits of the checksum. 
Therefore, when truncating * a weak checksum we XOR the first 2 words with the last 2 so * that we don't "lose" any entropy unnecessarily. */ if (xor) { cksum->zc_word[0] ^= cksum->zc_word[2]; cksum->zc_word[1] ^= cksum->zc_word[3]; } cksum->zc_word[2] = saved->zc_word[2]; cksum->zc_word[3] = saved->zc_word[3]; } /* * Generate the checksum. */ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, abd_t *abd, uint64_t size) { static const uint64_t zec_magic = ZEC_MAGIC; blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t cksum, saved; spa_t *spa = zio->io_spa; boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0; ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); zio_checksum_template_init(checksum, spa); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_eck_t eck; size_t eck_offset; memset(&saved, 0, sizeof (zio_cksum_t)); if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); - size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, - uint64_t); + uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused, + ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(size, >=, nused); + size = nused; eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck); } else { + ASSERT3U(size, >=, sizeof (zio_eck_t)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); } if (checksum == ZIO_CHECKSUM_GANG_HEADER) { zio_checksum_gang_verifier(&eck.zec_cksum, bp); } else if (checksum == ZIO_CHECKSUM_LABEL) { zio_checksum_label_verifier(&eck.zec_cksum, offset); } else { saved = eck.zec_cksum; eck.zec_cksum = bp->blk_cksum; } abd_copy_from_buf_off(abd, &zec_magic, eck_offset + offsetof(zio_eck_t, zec_magic), sizeof (zec_magic)); abd_copy_from_buf_off(abd, &eck.zec_cksum, eck_offset + offsetof(zio_eck_t, zec_cksum), sizeof (zio_cksum_t)); ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); if (bp != NULL && BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_checksum_handle_crypt(&cksum, &saved, insecure); abd_copy_from_buf_off(abd, &cksum, eck_offset + offsetof(zio_eck_t, zec_cksum), sizeof (zio_cksum_t)); } else { saved = bp->blk_cksum; ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) zio_checksum_handle_crypt(&cksum, &saved, insecure); bp->blk_cksum = cksum; } } int zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum; zio_eck_t eck; int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (SET_ERROR(EINVAL)); zio_checksum_template_init(checksum, spa); IMPLY(bp == NULL, ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED); IMPLY(bp == NULL, checksum == ZIO_CHECKSUM_LABEL); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { zio_cksum_t verifier; size_t eck_offset; if (checksum == ZIO_CHECKSUM_ZILOG2) { zil_chain_t zilc; uint64_t nused; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck) + offsetof(zio_eck_t, zec_cksum); if (eck.zec_magic == ZEC_MAGIC) { nused = zilc.zc_nused; } else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) { nused = BSWAP_64(zilc.zc_nused); } else { return (SET_ERROR(ECKSUM)); } - if (nused > size) { + nused = 
P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + if (size < nused) return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + size = nused; } else { + if (size < sizeof (zio_eck_t)) + return (SET_ERROR(ECKSUM)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); eck_offset += offsetof(zio_eck_t, zec_cksum); } if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) zio_checksum_label_verifier(&verifier, offset); else verifier = bp->blk_cksum; byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); expected_cksum = eck.zec_cksum; abd_copy_from_buf_off(abd, &verifier, eck_offset, sizeof (zio_cksum_t)); ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); abd_copy_from_buf_off(abd, &expected_cksum, eck_offset, sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } } else { byteswap = BP_SHOULD_BYTESWAP(bp); expected_cksum = bp->blk_cksum; ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); } /* * MAC checksums are a special case since half of this checksum will * actually be the encryption MAC. This will be verified by the * decryption process, so we just check the truncated checksum now. * Objset blocks use embedded MACs so we don't truncate the checksum * for them. */ if (bp != NULL && BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) { if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) { actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2]; actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3]; } actual_cksum.zc_word[2] = 0; actual_cksum.zc_word[3] = 0; expected_cksum.zc_word[2] = 0; expected_cksum.zc_word[3] = 0; } if (info != NULL) { info->zbc_checksum_name = ci->ci_name; info->zbc_byteswapped = byteswap; info->zbc_injected = 0; info->zbc_has_cksum = 1; } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); return (0); } int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) { blkptr_t *bp = zio->io_bp; uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int error; uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; abd_t *data = zio->io_abd; spa_t *spa = zio->io_spa; error = zio_checksum_error_impl(spa, bp, checksum, data, size, offset, info); if (zio_injection_enabled && error == 0 && zio->io_error == 0) { error = zio_handle_fault_injection(zio, ECKSUM); if (error != 0) info->zbc_injected = 1; } return (error); } /* * Called by a spa_t that's about to be deallocated. This steps through * all of the checksum context templates and deallocates any that were * initialized using the algorithm-specific template init function. 
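 *
 * As an informal sketch of the template lifecycle (the functions
 * themselves are authoritative):
 *
 *   zio_checksum_template_init(checksum, spa)
 *       lazily calls ci_tmpl_init(&spa->spa_cksum_salt) once, under
 *       spa_cksum_tmpls_lock, and caches the result in
 *       spa->spa_cksum_tmpls[checksum];
 *
 *   ci_func[native or byteswap](abd, size,
 *       spa->spa_cksum_tmpls[checksum], &cksum)
 *       every subsequent checksum computation reuses the cached
 *       template;
 *
 *   zio_checksum_templates_free(spa)
 *       calls ci_tmpl_free() on each non-NULL entry at spa teardown.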
*/ void zio_checksum_templates_free(spa_t *spa) { for (enum zio_checksum checksum = 0; checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { if (spa->spa_cksum_tmpls[checksum] != NULL) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; VERIFY(ci->ci_tmpl_free != NULL); ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); spa->spa_cksum_tmpls[checksum] = NULL; } } } diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index ac373379e43f..ce5b75462349 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -1,1748 +1,1756 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev// * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ /* * Note on locking of zvol state structures. * * These structures are used to maintain internal state used to emulate block * devices on top of zvols. In particular, management of device minor number * operations - create, remove, rename, and set_snapdev - involves access to * these structures. The zvol_state_lock is primarily used to protect the * zvol_state_list. The zv->zv_state_lock is used to protect the contents * of the zvol_state_t structures, as well as to make sure that when the * time comes to remove the structure from the list, it is not in use, and * therefore, it can be taken off zvol_state_list and freed. * * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, * e.g. for the duration of receive and rollback operations. This lock can be * held for significant periods of time. Given that it is undesirable to hold * mutexes for long periods of time, the following lock ordering applies: * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t * * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. 
Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is * allocated and placed on zvol_state_list, and then other minor operations * for this zvol are going to proceed in the order of issue. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; typedef enum { ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_SET_VOLMODE, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t op; char name1[MAXNAMELEN]; char name2[MAXNAMELEN]; uint64_t value; } zvol_task_t; uint64_t zvol_name_hash(const char *name) { uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; return (crc); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p = NULL; rw_enter(&zvol_state_lock, RW_READER); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) { /* * this is the right zvol, take the locks in the * right order */ if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, mode); mutex_enter(&zv->zv_state_lock); /* * zvol cannot be renamed as we continue * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0); } rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name(const char *name, int mode) { return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. 
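 * (Here that means ZFS_PROP_VOLSIZE and ZFS_PROP_VOLBLOCKSIZE, both already
 * consumed above: volblocksize sizes the DMU object claim and volsize is
 * written to the "size" entry of ZVOL_ZAP_OBJ.)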
*/ VERIFY(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT(error == 0); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT(error == 0); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (SET_ERROR(error)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (SET_ERROR(error)); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. */ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (SET_ERROR(error)); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } /* * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume * size will result in a udev "change" event being generated. */ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (SET_ERROR(error)); if (readonly) return (SET_ERROR(EROFS)); zvol_state_t *zv = zvol_find_by_name(name, RW_READER); ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { if (zv != NULL) rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); return (SET_ERROR(error)); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; } out: kmem_free(doi, sizeof (dmu_object_info_t)); if (owned) { dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } if (zv != NULL) mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) zvol_os_update_volsize(zv, volsize); return (SET_ERROR(error)); } /* * Update volthreading. 
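 * Hypothetical usage sketch (the dataset name is an example only):
 *
 *	error = zvol_set_volthreading("tank/vol1", B_TRUE);
 *	(error is ENOENT when no such zvol is currently registered)
 *
 * Only zv_threading is updated, under zv_state_lock; zv_suspend_lock is
 * not taken because the lookup uses RW_NONE.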
*/ int zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) return (ENOENT); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); } /* * Update zvol ro property. */ int zvol_set_ro(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) return (-1); if (value) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } mutex_exit(&zv->zv_state_lock); return (0); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) return (SET_ERROR(EDOM)); spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); int error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } return (error); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } return (error); } /* * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed * after a system failure. * * TODO: For now we drop block cloning transations for ZVOLs as they are * unsupported, but we still need to inform BRT about that as we * claimed them during pool import. 
* This situation can occur when we try to import a pool from a ZFS * version supporting block cloning for ZVOLs into a system that * has this ZFS version, that doesn't support block cloning for ZVOLs. */ static int zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) { char name[ZFS_MAX_DATASET_NAME_LEN]; zvol_state_t *zv = arg1; objset_t *os = zv->zv_objset; lr_clone_range_t *lr = arg2; blkptr_t *bp; dmu_tx_t *tx; spa_t *spa; uint_t ii; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + dmu_objset_name(os, name); cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.", name); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); tx = dmu_tx_create(os); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } spa = os->os_spa; for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; if (!BP_IS_HOLE(bp)) { zio_free(spa, dmu_tx_get_txg(tx), bp); } } (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); return (0); } static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ zvol_replay_clone_range /* TX_CLONE_RANGE */ }; /* * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. * * We store data in the log buffers if it's small enough. * Otherwise we will later flush the data out via dmu_sync(). */ static const ssize_t zvol_immediate_write_sz = 32768; void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; uint64_t sz = size; if (zil_replaying(zilog, tx)) return; if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) write_state = WR_INDIRECT; else if (!spa_has_slogs(zilog->zl_spa) && size >= blocksize && blocksize > zvol_immediate_write_sz) write_state = WR_INDIRECT; else if (commit) write_state = WR_COPIED; else write_state = WR_NEED_COPY; while (size) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = size; if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { zil_itx_destroy(itx); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; (void) zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } if (write_state == WR_COPIED || write_state == WR_NEED_COPY) { dsl_pool_wrlog_count(zilog->zl_dmu_pool, sz, tx->tx_txg); } } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; zil_itx_assign(zilog, itx, tx); } static void zvol_get_done(zgd_t *zgd, int error) { (void) error; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH); } else { /* indirect write */ ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (SET_ERROR(error)); } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. 
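 * (Both the zvol_state_list entry and the zvol_htable hash link are
 * dropped; as the ASSERT below notes, the caller must hold
 * zvol_state_lock as writer.)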
*/ static void zvol_remove(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Setup zv after we just own the zv->objset */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); zv->zv_zilog = NULL; zv->zv_flags &= ~ZVOL_WRITTEN_TO; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (SET_ERROR(error)); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (SET_ERROR(error)); error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (SET_ERROR(error)); zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shutdown every zv_objset related stuff except zv_objset itself. * The is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_WRITTEN_TO) { ASSERT(zv->zv_zilog != NULL); zil_close(zv->zv_zilog); } zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); (void) dmu_objset_evict_dbufs(zv->zv_objset); } /* * return the proper tag for rollback and recv */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback. */ zvol_state_t * zvol_suspend(const char *name) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) return (NULL); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); /* * do not hold zv_state_lock across suspend/resume to * avoid locking up zvol lookups */ mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ return (zv); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); mutex_enter(&zv->zv_state_lock); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because rwlock is * not inherent atomic. 
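 * (zv_suspend_ref fills that role instead: zvol_remove_minors_impl() and
 * zvol_remove_minor_impl() leave a zvol alone while its suspend reference
 * count is nonzero, so the structure is not freed out from under a
 * pending zvol_resume().)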
*/ atomic_dec(&zv->zv_suspend_ref); return (SET_ERROR(error)); } int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(mutex_owned(&spa_namespace_lock)); boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) return (SET_ERROR(error)); zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } return (error); } void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); zv->zv_objset = NULL; } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * If spa_keystore_load_wkey() is called for an encrypted zvol, * we need to look for any clones also using the key. This function * is "best effort" - so we just skip over it if there are failures. 
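 * (Flow sketch: hold the dsl_pool, walk the dd_clones ZAP of the
 * dataset's dsl_dir, and queue a minors_job_t for every clone found so a
 * minor node is created for it as well; failures just mean fewer minors
 * get queued, matching the best-effort note above.)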
*/ static void zvol_add_clones(const char *dsname, list_t *minors_list) { /* Also check if it has clones */ dsl_dir_t *dd = NULL; dsl_pool_t *dp = NULL; if (dsl_pool_hold(dsname, FTAG, &dp) != 0) return; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) goto out; if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0) goto out; if (dsl_dir_phys(dd)->dd_clones == 0) goto out; zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); objset_t *mos = dd->dd_pool->dp_meta_objset; for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; minors_job_t *job; if (dsl_dataset_hold_obj(dd->dd_pool, za->za_first_integer, FTAG, &clone) == 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(clone, name); char *n = kmem_strdup(name); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); dsl_dataset_rele(clone, FTAG); } } zap_cursor_fini(zc); kmem_free(za, sizeof (zap_attribute_t)); kmem_free(zc, sizeof (zap_cursor_t)); out: if (dd != NULL) dsl_dir_rele(dd, FTAG); dsl_pool_rele(dp, FTAG); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); zvol_add_clones(dsname, minors_list); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ (void) dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach allows one to assure that the * snapshot metadata is read from disk only if it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. 
*/ void zvol_create_minors_recursive(const char *name) { list_t minors_list; minors_job_t *job; if (zvol_inhibit_dev) return; /* * This is the list for prefetch jobs. Whenever we found a match * during dmu_objset_find, we insert a minors_job to the list and do * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * * We will use this list to do zvol_os_create_minor after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) zvol_os_create_minor(name); } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); } void zvol_create_minor(const char *name) { /* * Note: the dsl_pool_config_lock must not be held. * Minor node creation needs to obtain the zvol_state_lock. * zvol_open() obtains the zvol_state_lock and then the dsl pool * config lock. Therefore, we can't have the config lock now if * we are going to wait for the zvol_state_lock, because it * would be a lock order inversion which could lead to deadlock. */ if (zvol_inhibit_dev) return; if (strchr(name, '@') != NULL) { uint64_t snapdev; int error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) (void) zvol_os_create_minor(name); } else { (void) zvol_os_create_minor(name); } } /* * Remove minors for specified dataset including children and snapshots. */ static void zvol_free_task(void *arg) { zvol_os_free(arg); } void zvol_remove_minors_impl(const char *name) { zvol_state_t *zv, *zv_next; int namelen = ((name) ? strlen(name) : 0); taskqid_t t; list_t free_list; if (zvol_inhibit_dev) return; list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (name == NULL || strcmp(zv->zv_name, name) == 0 || (strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. 
*/ zvol_os_clear_private(zv); /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ t = taskq_dispatch(system_taskq, zvol_free_task, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); } else { mutex_exit(&zv->zv_state_lock); } } rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ while ((zv = list_remove_head(&free_list)) != NULL) zvol_os_free(zv); } /* Remove minor for this specific volume only */ static void zvol_remove_minor_impl(const char *name) { zvol_state_t *zv = NULL, *zv_next; if (zvol_inhibit_dev) return; rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, name) == 0) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ /* If in use, leave alone */ if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { mutex_exit(&zv->zv_state_lock); continue; } zvol_remove(zv); zvol_os_clear_private(zv); mutex_exit(&zv->zv_state_lock); break; } else { mutex_exit(&zv->zv_state_lock); } } /* Drop zvol_state_lock before calling zvol_free() */ rw_exit(&zvol_state_lock); if (zv != NULL) zvol_os_free(zv); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(const char *oldname, const char *newname) { zvol_state_t *zv, *zv_next; int oldnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); zvol_os_rename_minor(zv, name); kmem_strfree(name); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); } typedef struct zvol_snapdev_cb_arg { uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: (void) zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: (void) zvol_remove_minor_impl(dsname); break; } return (0); } static void zvol_set_snapdev_impl(char *name, uint64_t snapdev) { zvol_snapdev_cb_arg_t arg = {snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } static void zvol_set_volmode_impl(char *name, uint64_t volmode) { fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; if (strchr(name, '@') != NULL) return; /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). 
*/ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); if (old_volmode == volmode) return; zvol_wait_close(zv); } cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: (void) zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: (void) zvol_remove_minor_impl(name); (void) zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: (void) zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ (void) zvol_os_create_minor(name); break; } spl_fstrans_unmark(cookie); } static zvol_task_t * zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2, uint64_t value) { zvol_task_t *task; /* Never allow tasks on hidden names. */ if (name1[0] == '$') return (NULL); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->op = op; task->value = value; strlcpy(task->name1, name1, sizeof (task->name1)); if (name2 != NULL) strlcpy(task->name2, name2, sizeof (task->name2)); return (task); } static void zvol_task_free(zvol_task_t *task) { kmem_free(task, sizeof (zvol_task_t)); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *arg) { zvol_task_t *task = arg; switch (task->op) { case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task->name1); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task->name1, task->name2); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task->name1, task->value); break; case ZVOL_ASYNC_SET_VOLMODE: zvol_set_volmode_impl(task->name1, task->value); break; default: VERIFY(0); break; } zvol_task_free(task); } typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; zfs_prop_t zsda_prop; } zvol_set_prop_int_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_common_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { zvol_set_prop_int_arg_t *zsda = arg; char dsname[ZFS_MAX_DATASET_NAME_LEN]; zvol_task_t *task; uint64_t prop; const char *prop_name = zfs_prop_to_name(zsda->zsda_prop); dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0) return (0); switch (zsda->zsda_prop) { case ZFS_PROP_VOLMODE: task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, prop); break; case ZFS_PROP_SNAPDEV: task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, prop); break; default: task = NULL; break; } if (task == NULL) return (0); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply the property appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "property" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. 
*/ static void zvol_set_common_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source, uint64_t val) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = val; zsda.zsda_prop = prop; return (dsl_sync_task(ddname, zvol_set_common_check, zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); if (task == NULL) return; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } boolean_t zvol_is_zvol(const char *name) { return (zvol_os_is_zvol(name)); } int zvol_init_impl(void) { int i; list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); return (0); } void zvol_fini_impl(void) { zvol_remove_minors_impl(NULL); /* * The call to "zvol_remove_minors_impl" may dispatch entries to * the system_taskq, but it doesn't wait for those entries to * complete before it returns. Thus, we must wait for all of the * removals to finish, before we can continue. */ taskq_wait_outstanding(system_taskq, 0); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); } diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib b/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib index 6b83b10d604d..51eff3023e73 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib +++ b/sys/contrib/openzfs/tests/zfs-tests/include/blkdev.shlib @@ -1,634 +1,634 @@ # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # # # Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # Copyright (c) 2012, 2019 by Delphix. All rights reserved. # Copyright 2016 Nexenta Systems, Inc. # Copyright (c) 2016, 2017 by Intel Corporation. All rights reserved. 
# Copyright (c) 2017 Lawrence Livermore National Security, LLC. # Copyright (c) 2017 Datto Inc. # Copyright (c) 2017 Open-E, Inc. All Rights Reserved. # Copyright 2019 Richard Elling # # # Returns SCSI host number for the given disk # function get_scsi_host #disk { typeset disk=$1 ls /sys/block/${disk}/device/scsi_device | cut -d : -f 1 } # # Cause a scan of all scsi host adapters by default # # $1 optional host number # function scan_scsi_hosts { typeset hostnum=${1} if is_linux; then if [[ -z $hostnum ]]; then for host in /sys/class/scsi_host/host*; do log_must eval "echo '- - -' > $host/scan" done else log_note "/sys/class/scsi_host/host$hostnum/scan" log_must eval \ "echo '- - -' > /sys/class/scsi_host/host$hostnum/scan" fi fi } # # Wait for newly created block devices to have their minors created. # Additional arguments can be passed to udevadm trigger, with the expected # arguments to typically be a block device pathname. This is useful when # checking waiting on a specific device to settle rather than triggering # all devices and waiting for them all to settle. # # The udevadm settle timeout can be 120 or 180 seconds by default for # some distros. If a long delay is experienced, it could be due to some # strangeness in a malfunctioning device that isn't related to the devices # under test. To help debug this condition, a notice is given if settle takes # too long. # # Note: there is no meaningful return code if udevadm fails. Consumers # should not expect a return code (do not call as argument to log_must) # function block_device_wait { if is_linux; then udevadm trigger $* 2>/dev/null typeset start=$SECONDS udevadm settle typeset elapsed=$((SECONDS - start)) [[ $elapsed > 60 ]] && \ log_note udevadm settle time too long: $elapsed elif is_freebsd; then if [[ ${#@} -eq 0 ]]; then # Do something that has to go through the geom event # queue to complete. sysctl kern.geom.conftxt >/dev/null return fi fi # Poll for the given paths to appear, but give up eventually. typeset -i i for (( i = 0; i < 5; ++i )); do typeset missing=false typeset dev for dev in "${@}"; do if ! [[ -e $dev ]]; then missing=true break fi done if ! $missing; then break fi sleep ${#@} done } # # Check if the given device is physical device # function is_physical_device #device { typeset device=${1#$DEV_DSKDIR/} device=${device#$DEV_RDSKDIR/} if is_linux; then is_disk_device "$DEV_DSKDIR/$device" && \ [ -f /sys/module/loop/parameters/max_part ] elif is_freebsd; then is_disk_device "$DEV_DSKDIR/$device" && \ echo $device | grep -qE \ -e '^a?da[0-9]+$' \ -e '^md[0-9]+$' \ -e '^mfid[0-9]+$' \ -e '^nda[0-9]+$' \ -e '^nvd[0-9]+$' \ -e '^vtbd[0-9]+$' else echo $device | grep -qE "^c[0-F]+([td][0-F]+)+$" fi } # # Check if the given device is a real device (ie SCSI device) # function is_real_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_RDSKDIR/$disk -o TYPE 2>/dev/null | \ grep -q disk fi } # # Check if the given device is a loop device # function is_loop_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then lsblk $DEV_RDSKDIR/$disk -o TYPE 2>/dev/null | \ grep -q loop fi } # # Linux: # Check if the given device is a multipath device and if there is a symbolic # link to a device mapper and to a disk # Currently no support for dm devices alone without multipath # # FreeBSD: # Check if the given device is a gmultipath device. # # Others: # No multipath detection. 
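# Usage sketch (the device name below is only an example):
#
#	if is_mpath_device mpatha; then
#		echo "mpatha is managed by device-mapper multipath"
#	fi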
# function is_mpath_device #disk { typeset disk=$1 [[ -z $disk ]] && log_fail "No argument for disk given." if is_linux; then if lsblk $DEV_MPATHDIR/$disk -o TYPE 2>/dev/null | \ grep -q mpath; then readlink $DEV_MPATHDIR/$disk > /dev/null 2>&1 else false fi elif is_freebsd; then is_disk_device $DEV_MPATHDIR/$disk else false fi } # # Check if the given path is the appropriate sort of device special node. # function is_disk_device #path { typeset path=$1 if is_freebsd; then # FreeBSD doesn't have block devices, only character devices. test -c $path else test -b $path fi } # Set the slice prefix for disk partitioning depending # on whether the device is a real, multipath, or loop device. # Currently all disks have to be of the same type, so only # checks first disk to determine slice prefix. # function set_slice_prefix { typeset disk typeset -i i=0 if is_linux; then while (( i < $DISK_ARRAY_NUM )); do disk="$(echo $DISKS | awk '{print $(i + 1)}')" if is_mpath_device $disk && ! echo $disk | awk 'substr($1,18,1) ~ /^[[:digit:]]+$/ {exit 1}' || is_real_device $disk; then export SLICE_PREFIX="" return 0 elif is_mpath_device $disk || is_loop_device $disk; then export SLICE_PREFIX="p" return 0 else log_fail "$disk not supported for partitioning." fi (( i = i + 1)) done fi } # # Set the directory path of the listed devices in $DISK_ARRAY_NUM # Currently all disks have to be of the same type, so only # checks first disk to determine device directory # default = /dev (linux) # real disk = /dev (linux) # multipath device = /dev/mapper (linux) # function set_device_dir { typeset disk typeset -i i=0 if is_linux; then while (( i < $DISK_ARRAY_NUM )); do disk="$(echo $DISKS | awk '{print $(i + 1)}')" if is_mpath_device $disk; then export DEV_DSKDIR=$DEV_MPATHDIR return 0 else export DEV_DSKDIR=$DEV_RDSKDIR return 0 fi (( i = i + 1)) done else export DEV_DSKDIR=$DEV_RDSKDIR fi } # # Get the directory path of given device # function get_device_dir #device { typeset device=$1 if ! is_freebsd && ! is_physical_device $device; then if [[ $device != "/" ]]; then device=${device%/*} fi if is_disk_device "$DEV_DSKDIR/$device"; then device="$DEV_DSKDIR" fi echo $device else echo "$DEV_DSKDIR" fi } # # Get persistent name for given disk # function get_persistent_disk_name #device { typeset device=$1 if is_linux; then if is_real_device $device; then udevadm info -q all -n $DEV_DSKDIR/$device \ - | awk '/disk\/by-id/ {print $2; exit}' | cut -d/ -f3 + | awk '/disk\/by-id/ {print $2; exit}' | cut -d/ -f3- elif is_mpath_device $device; then udevadm info -q all -n $DEV_DSKDIR/$device \ | awk '/disk\/by-id\/dm-uuid/ {print $2; exit}' \ | cut -d/ -f3 else echo $device fi else echo $device fi } # # Online or offline a disk on the system # # First checks state of disk. Test will fail if disk is not properly onlined # or offlined. Online is a full rescan of SCSI disks by echoing to every # host entry. 
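# Typical callers are remove_disk() and insert_disk() below, sketched here
# with example device and host numbers:
#
#	on_off_disk sdb offline		# simulate pulling disk sdb
#	on_off_disk sdb online 0	# rescan SCSI host 0 and restore it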
# function on_off_disk # disk state{online,offline} host { typeset disk=$1 typeset state=$2 typeset host=$3 [[ -z $disk ]] || [[ -z $state ]] && \ log_fail "Arguments invalid or missing" if is_linux; then if [[ $state == "offline" ]] && ( is_mpath_device $disk ); then dm_name="$(readlink $DEV_DSKDIR/$disk | cut -d/ -f2)" dep="$(ls /sys/block/${dm_name}/slaves | awk '{print $1}')" while [[ -n $dep ]]; do #check if disk is online if lsscsi | grep -qF $dep; then dep_dir="/sys/block/${dm_name}" dep_dir+="/slaves/${dep}/device" ss="${dep_dir}/state" sd="${dep_dir}/delete" log_must eval "echo 'offline' > ${ss}" log_must eval "echo '1' > ${sd}" if lsscsi | grep -qF $dep; then log_fail "Offlining $disk failed" fi fi dep="$(ls /sys/block/$dm_name/slaves 2>/dev/null | awk '{print $1}')" done elif [[ $state == "offline" ]] && ( is_real_device $disk ); then #check if disk is online if lsscsi | grep -qF $disk; then dev_state="/sys/block/$disk/device/state" dev_delete="/sys/block/$disk/device/delete" log_must eval "echo 'offline' > ${dev_state}" log_must eval "echo '1' > ${dev_delete}" if lsscsi | grep -qF $disk; then log_fail "Offlining $disk failed" fi else log_note "$disk is already offline" fi elif [[ $state == "online" ]]; then #force a full rescan scan_scsi_hosts $host block_device_wait if is_mpath_device $disk; then dm_name="$(readlink $DEV_DSKDIR/$disk | cut -d/ -f2)" dep="$(ls /sys/block/$dm_name/slaves | awk '{print $1}')" if lsscsi | grep -qF $dep; then log_fail "Onlining $disk failed" fi elif is_real_device $disk; then block_device_wait typeset -i retries=0 while ! lsscsi | grep -qF $disk; do if (( $retries > 2 )); then log_fail "Onlining $disk failed" break fi (( ++retries )) sleep 1 done else log_fail "$disk is not a real dev" fi else log_fail "$disk failed to $state" fi fi } # # Simulate disk removal # function remove_disk #disk { typeset disk=$1 on_off_disk $disk "offline" block_device_wait } # # Simulate disk insertion for the given SCSI host # function insert_disk #disk scsi_host { typeset disk=$1 typeset scsi_host=$2 on_off_disk $disk "online" $scsi_host block_device_wait } # # Load scsi_debug module with specified parameters # $blksz can be either one of: < 512b | 512e | 4Kn > # function load_scsi_debug # dev_size_mb add_host num_tgts max_luns blksz { typeset devsize=$1 typeset hosts=$2 typeset tgts=$3 typeset luns=$4 typeset blksz=$5 [[ -z $devsize ]] || [[ -z $hosts ]] || [[ -z $tgts ]] || \ [[ -z $luns ]] || [[ -z $blksz ]] && \ log_fail "Arguments invalid or missing" case "$5" in '512b') typeset sector=512 typeset blkexp=0 ;; '512e') typeset sector=512 typeset blkexp=3 ;; '4Kn') typeset sector=4096 typeset blkexp=0 ;; *) log_fail "Unsupported blksz value: $5" ;; esac if is_linux; then modprobe -n scsi_debug || log_unsupported "Platform does not have scsi_debug module" if lsmod | grep -q scsi_debug; then log_fail "scsi_debug module already installed" else log_must modprobe scsi_debug dev_size_mb=$devsize \ add_host=$hosts num_tgts=$tgts max_luns=$luns \ sector_size=$sector physblk_exp=$blkexp block_device_wait if ! lsscsi | grep -q scsi_debug; then log_fail "scsi_debug module install failed" fi fi fi } # # Unload scsi_debug module, if needed. # function unload_scsi_debug { log_must_retry "in use" 5 modprobe -r scsi_debug } # # Get scsi_debug device name. # Returns basename of scsi_debug device (for example "sdb"). 
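# Usage sketch:
#
#	typeset sd=$(get_debug_device)	# e.g. "sdb" once lsscsi has settled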
# function get_debug_device { for i in {1..10} ; do val=$(lsscsi | awk '/scsi_debug/ {print $6; exit}' | cut -d/ -f3) # lsscsi can take time to settle if [ "$val" != "-" ] ; then break fi sleep 1 done echo "$val" } # # Get actual devices used by the pool (i.e. linux sdb1 not sdb). # function get_pool_devices #testpool #devdir { typeset testpool=$1 typeset devdir=$2 typeset out="" case "$UNAME" in Linux|FreeBSD) zpool status -P $testpool | awk -v d="$devdir" '$1 ~ d {sub(d "/", ""); printf("%s ", $1)}' ;; esac } # # Write to standard out giving the level, device name, offset and length # of all blocks in an input file. The offset and length are in units of # 512 byte blocks. In the case of mirrored vdevs, only the first # device is listed, as the levels, blocks and offsets will be the same # on other devices. Note that this function only works with mirrored # or non-redundant pools, not raidz. # # The output of this function can be used to introduce corruption at # varying levels of indirection. # function list_file_blocks # input_file { typeset input_file=$1 [[ -f $input_file ]] || log_fail "Couldn't find $input_file" typeset ds="$(zfs list -H -o name $input_file)" typeset pool="${ds%%/*}" typeset objnum="$(get_objnum $input_file)" # # Establish a mapping between vdev ids as shown in a DVA and the # pathnames they correspond to in ${VDEV_MAP[][]}. # # The vdev bits in a DVA refer to the top level vdev id. # ${VDEV_MAP[$id]} is an array of the vdev paths within that vdev. # eval $(zdb -C $pool | awk ' BEGIN { printf "typeset -a VDEV_MAP;" } function subscript(s) { # "[#]" is more convenient than the bare "#" match(s, /\[[0-9]*\]/) return substr(s, RSTART, RLENGTH) } id && !/^ / { # left a top level vdev id = 0 } id && $1 ~ /^path:$/ { # found a vdev path; save it in the map printf "VDEV_MAP%s%s=%s;", id, child, $2 } /^ children/ { # entering a top level vdev id = subscript($0) child = "[0]" # default in case there is no nested vdev printf "typeset -a VDEV_MAP%s;", id } /^ children/ { # entering a nested vdev (e.g. child of a top level mirror) child = subscript($0) } ') # # The awk below parses the output of zdb, printing out the level # of each block along with vdev id, offset and length. The last # two are converted to decimal in the while loop. 4M is added to # the offset to compensate for the first two labels and boot # block. Lastly, the offset and length are printed in units of # 512B blocks for ease of use with dd. # typeset level vdev path offset length if awk -n '' 2>/dev/null; then # gawk needs -n to decode hex AWK='awk -n' else AWK='awk' fi sync_all_pools true zdb -dddddd $ds $objnum | $AWK -v pad=$((4<<20)) -v bs=512 ' /^$/ { looking = 0 } looking { level = $2 field = 3 while (split($field, dva, ":") == 3) { # top level vdev id vdev = int(dva[1]) # offset + 4M label/boot pad in 512B blocks offset = (int("0x"dva[2]) + pad) / bs # length in 512B blocks len = int("0x"dva[3]) / bs print level, vdev, offset, len ++field } } /^Indirect blocks:/ { looking = 1 } ' | \ while read level vdev offset length; do for path in ${VDEV_MAP[$vdev][@]}; do echo "$level $path $offset $length" done done 2>/dev/null } function corrupt_blocks_at_level # input_file corrupt_level { typeset input_file=$1 typeset corrupt_level="L${2:-0}" typeset level path offset length [[ -f $input_file ]] || log_fail "Couldn't find $input_file" if is_freebsd; then # Temporarily allow corrupting an inuse device. 
debugflags=$(sysctl -n kern.geom.debugflags) sysctl kern.geom.debugflags=16 fi list_file_blocks $input_file | \ while read level path offset length; do if [[ $level = $corrupt_level ]]; then log_must dd if=/dev/urandom of=$path bs=512 \ count=$length seek=$offset conv=notrunc fi done if is_freebsd; then sysctl kern.geom.debugflags=$debugflags fi # This is necessary for pools made of loop devices. sync } function corrupt_label_checksum # label_number vdev_path { typeset label_size=$((256*1024)) typeset vdev_size=$(stat_size ${2}) typeset -a offsets=("$((128*1024 - 32))" \ "$(($label_size + (128*1024 - 32)))" \ "$(($vdev_size - $label_size - (128*1024 + 32)))" \ "$(($vdev_size - (128*1024 + 32)))") dd if=/dev/urandom of=${2} seek=${offsets[$1]} bs=1 count=32 \ conv=notrunc } diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index aab672a77324..24fb572f00b0 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,1149 +1,1149 @@ /* */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. */ /* #undef ENABLE_NLS */ /* bio_end_io_t wants 1 arg */ /* #undef HAVE_1ARG_BIO_END_IO_T */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* submit_bio() wants 1 arg */ /* #undef HAVE_1ARG_SUBMIT_BIO */ /* bdi_setup_and_register() wants 2 args */ /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 2 args */ /* #undef HAVE_2ARGS_VFS_GETATTR */ /* zlib_deflate_workspacesize() wants 2 args */ /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */ /* bdi_setup_and_register() wants 3 args */ /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */ /* vfs_getattr wants 3 args */ /* #undef HAVE_3ARGS_VFS_GETATTR */ /* vfs_getattr wants 4 args */ /* #undef HAVE_4ARGS_VFS_GETATTR */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* posix_acl has refcount_t */ /* #undef HAVE_ACL_REFCOUNT */ /* add_disk() returns int */ /* #undef HAVE_ADD_DISK_RET */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 /* Define if you have [rt] */ #define HAVE_AIO_H 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* bdevname() is available */ /* #undef HAVE_BDEVNAME */ /* bdev_check_media_change() exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_63 */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_OLD */ /* bdev_kobj() exists */ /* #undef HAVE_BDEV_KOBJ */ /* bdev_max_discard_sectors() is available */ /* #undef HAVE_BDEV_MAX_DISCARD_SECTORS */ /* bdev_max_secure_erase_sectors() is available */ /* 
#undef HAVE_BDEV_MAX_SECURE_ERASE_SECTORS */ /* block_device_operations->submit_bio() returns void */ /* #undef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio_alloc() takes 4 arguments */ /* #undef HAVE_BIO_ALLOC_4ARG */ /* bio->bi_bdev->bd_disk exists */ /* #undef HAVE_BIO_BDEV_DISK */ /* bio->bi_opf is defined */ /* #undef HAVE_BIO_BI_OPF */ /* bio->bi_status exists */ /* #undef HAVE_BIO_BI_STATUS */ /* bio has bi_iter */ /* #undef HAVE_BIO_BVEC_ITER */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_max_segs() is implemented */ /* #undef HAVE_BIO_MAX_SEGS */ /* bio_set_dev() is available */ /* #undef HAVE_BIO_SET_DEV */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_dev() is a macro */ /* #undef HAVE_BIO_SET_DEV_MACRO */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_get_by_path() exists and takes 4 args */ /* #undef HAVE_BLKDEV_GET_BY_PATH_4ARG */ /* blkdev_get_by_path() handles ERESTARTSYS */ /* #undef HAVE_BLKDEV_GET_ERESTARTSYS */ /* blkdev_issue_discard() is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD */ /* blkdev_issue_secure_erase() is available */ /* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */ /* blkdev_put() accepts void* as arg 2 */ /* #undef HAVE_BLKDEV_PUT_HOLDER */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_disk() exists */ /* #undef HAVE_BLK_ALLOC_DISK */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk_cleanup_disk() exists */ /* #undef HAVE_BLK_CLEANUP_DISK */ /* blk_mode_t is defined */ /* #undef HAVE_BLK_MODE_T */ /* block multiqueue is available */ /* #undef HAVE_BLK_MQ */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_discard() is available */ /* #undef HAVE_BLK_QUEUE_DISCARD */ /* blk_queue_flag_clear() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */ /* blk_queue_flag_set() exists */ /* #undef HAVE_BLK_QUEUE_FLAG_SET */ /* blk_queue_flush() is available */ /* #undef HAVE_BLK_QUEUE_FLUSH */ /* blk_queue_flush() is GPL-only */ /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */ /* blk_queue_secdiscard() is available */ /* #undef HAVE_BLK_QUEUE_SECDISCARD */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ /* blk_queue_update_readahead() exists */ /* #undef HAVE_BLK_QUEUE_UPDATE_READAHEAD */ /* blk_queue_write_cache() exists */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */ /* blk_queue_write_cache() is GPL-only */ /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */ /* BLK_STS_RESV_CONFLICT is defined */ /* #undef HAVE_BLK_STS_RESV_CONFLICT */ /* Define if release() in block_device_operations takes 1 arg */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG */ /* Define if revalidate_disk() in block_device_operations */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYCURRENT */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. 
*/ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* clear_inode() is available */ /* #undef HAVE_CLEAR_INODE */ /* dentry uses const struct dentry_operations */ /* #undef HAVE_CONST_DENTRY_OPERATIONS */ /* copy_from_iter() is available */ /* #undef HAVE_COPY_FROM_ITER */ /* copy_splice_read exists */ /* #undef HAVE_COPY_SPLICE_READ */ /* copy_to_iter() is available */ /* #undef HAVE_COPY_TO_ITER */ /* cpu_has_feature() is GPL-only */ /* #undef HAVE_CPU_HAS_FEATURE_GPL_ONLY */ /* yes */ /* #undef HAVE_CPU_HOTPLUG */ /* current_time() exists */ /* #undef HAVE_CURRENT_TIME */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* dentry aliases are in d_u member */ /* #undef HAVE_DENTRY_D_U_ALIASES */ /* dequeue_signal() takes 4 arguments */ /* #undef HAVE_DEQUEUE_SIGNAL_4ARG */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* sops->dirty_inode() wants flags */ /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */ /* disk_check_media_change() exists */ /* #undef HAVE_DISK_CHECK_MEDIA_CHANGE */ /* disk_*_io_acct() available */ /* #undef HAVE_DISK_IO_ACCT */ /* disk_update_readahead() exists */ /* #undef HAVE_DISK_UPDATE_READAHEAD */ /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* d_make_root() is available */ /* #undef HAVE_D_MAKE_ROOT */ /* d_prune_aliases() is available */ /* #undef HAVE_D_PRUNE_ALIASES */ /* dops->d_revalidate() operation takes nameidata */ /* #undef HAVE_D_REVALIDATE_NAMEIDATA */ /* eops->encode_fh() wants child and parent inodes */ /* #undef HAVE_ENCODE_FH_WITH_INODE */ /* sops->evict_inode() exists */ /* #undef HAVE_EVICT_INODE */ /* FALLOC_FL_ZERO_RANGE is defined */ /* #undef HAVE_FALLOC_FL_ZERO_RANGE */ /* fault_in_iov_iter_readable() is available */ /* #undef HAVE_FAULT_IN_IOV_ITER_READABLE */ /* filemap_range_has_page() is available */ /* #undef HAVE_FILEMAP_RANGE_HAS_PAGE */ /* fops->aio_fsync() exists */ /* #undef HAVE_FILE_AIO_FSYNC */ /* file_dentry() is available */ /* #undef HAVE_FILE_DENTRY */ /* fops->fadvise() exists */ /* #undef HAVE_FILE_FADVISE */ /* file_inode() is available */ /* #undef HAVE_FILE_INODE */ /* flush_dcache_page() is GPL-only */ /* #undef HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY */ /* iops->follow_link() cookie */ /* #undef HAVE_FOLLOW_LINK_COOKIE */ /* iops->follow_link() nameidata */ /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */ /* Define if compiler supports -Wformat-overflow */ /* #undef HAVE_FORMAT_OVERFLOW */ /* fsync_bdev() is declared in include/blkdev.h */ /* #undef HAVE_FSYNC_BDEV */ /* fops->fsync() with range */ /* #undef HAVE_FSYNC_RANGE */ /* fops->fsync() without dentry */ /* #undef HAVE_FSYNC_WITHOUT_DENTRY */ /* yes */ /* #undef HAVE_GENERIC_FADVISE */ /* generic_fillattr requires struct mnt_idmap* */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP */ /* generic_fillattr requires struct mnt_idmap* and u32 request_mask */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK */ /* generic_fillattr requires struct user_namespace* */ /* #undef HAVE_GENERIC_FILLATTR_USERNS */ /* generic_*_io_acct() 3 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_3ARG */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* generic_readlink is global 
*/ /* #undef HAVE_GENERIC_READLINK */ /* generic_setxattr() exists */ /* #undef HAVE_GENERIC_SETXATTR */ /* generic_write_checks() takes kiocb */ /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ /* iops->get_acl() exists */ /* #undef HAVE_GET_ACL */ /* iops->get_acl() takes rcu */ /* #undef HAVE_GET_ACL_RCU */ /* has iops->get_inode_acl() */ /* #undef HAVE_GET_INODE_ACL */ /* iops->get_link() cookie */ /* #undef HAVE_GET_LINK_COOKIE */ /* iops->get_link() delayed */ /* #undef HAVE_GET_LINK_DELAYED */ /* group_info->gid exists */ /* #undef HAVE_GROUP_INFO_GID */ /* has_capability() is available */ /* #undef HAVE_HAS_CAPABILITY */ /* iattr->ia_vfsuid and iattr->ia_vfsgid exist */ /* #undef HAVE_IATTR_VFSID */ /* Define if you have the iconv() function and it works. */ #define HAVE_ICONV 1 /* iops->getattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_GETATTR */ /* iops->setattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_SETATTR */ /* APIs for idmapped mount are present */ /* #undef HAVE_IDMAP_MNT_API */ /* Define if compiler supports -Wimplicit-fallthrough */ /* #undef HAVE_IMPLICIT_FALLTHROUGH */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_INFINITE_RECURSION */ /* inode_get_ctime() exists in linux/fs.h */ /* #undef HAVE_INODE_GET_CTIME */ /* yes */ /* #undef HAVE_INODE_LOCK_SHARED */ /* inode_owner_or_capable() exists */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE */ /* inode_owner_or_capable() takes mnt_idmap */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_IDMAP */ /* inode_owner_or_capable() takes user_ns */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_USERNS */ /* inode_set_ctime_to_ts() exists in linux/fs.h */ /* #undef HAVE_INODE_SET_CTIME_TO_TS */ /* inode_set_flags() exists */ /* #undef HAVE_INODE_SET_FLAGS */ /* inode_set_iversion() exists */ /* #undef HAVE_INODE_SET_IVERSION */ /* inode->i_*time's are timespec64 */ /* #undef HAVE_INODE_TIMESPEC64_TIMES */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. 
*/ #define HAVE_INTTYPES_H 1 /* in_compat_syscall() is available */ /* #undef HAVE_IN_COMPAT_SYSCALL */ /* iops->create() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_CREATE_IDMAP */ /* iops->create() takes struct user_namespace* */ /* #undef HAVE_IOPS_CREATE_USERNS */ /* iops->mkdir() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKDIR_IDMAP */ /* iops->mkdir() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKDIR_USERNS */ /* iops->mknod() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKNOD_IDMAP */ /* iops->mknod() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKNOD_USERNS */ /* iops->permission() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_PERMISSION_IDMAP */ /* iops->permission() takes struct user_namespace* */ /* #undef HAVE_IOPS_PERMISSION_USERNS */ /* iops->rename() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_RENAME_IDMAP */ /* iops->rename() takes struct user_namespace* */ /* #undef HAVE_IOPS_RENAME_USERNS */ /* iops->setattr() exists */ /* #undef HAVE_IOPS_SETATTR */ /* iops->symlink() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_SYMLINK_IDMAP */ /* iops->symlink() takes struct user_namespace* */ /* #undef HAVE_IOPS_SYMLINK_USERNS */ /* iov_iter_advance() is available */ /* #undef HAVE_IOV_ITER_ADVANCE */ /* iov_iter_count() is available */ /* #undef HAVE_IOV_ITER_COUNT */ /* iov_iter_fault_in_readable() is available */ /* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */ /* iov_iter_revert() is available */ /* #undef HAVE_IOV_ITER_REVERT */ /* iov_iter_type() is available */ /* #undef HAVE_IOV_ITER_TYPE */ /* iov_iter types are available */ /* #undef HAVE_IOV_ITER_TYPES */ /* yes */ /* #undef HAVE_IO_SCHEDULE_TIMEOUT */ /* Define to 1 if you have the `issetugid' function. */ #define HAVE_ISSETUGID 1 /* iter_iov() is available */ /* #undef HAVE_ITER_IOV */ /* kernel has kernel_fpu_* functions */ /* #undef HAVE_KERNEL_FPU */ /* kernel has asm/fpu/api.h */ /* #undef HAVE_KERNEL_FPU_API_HEADER */ /* kernel fpu internal */ /* #undef HAVE_KERNEL_FPU_INTERNAL */ /* kernel has asm/fpu/internal.h */ /* #undef HAVE_KERNEL_FPU_INTERNAL_HEADER */ /* uncached_acl_sentinel() exists */ /* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_KERNEL_INFINITE_RECURSION */ /* kernel does stack verification */ /* #undef HAVE_KERNEL_OBJTOOL */ /* kernel has linux/objtool.h */ /* #undef HAVE_KERNEL_OBJTOOL_HEADER */ /* kernel_read() take loff_t pointer */ /* #undef HAVE_KERNEL_READ_PPOS */ /* timer_list.function gets a timer_list */ /* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */ /* struct timer_list has a flags member */ /* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */ /* timer_setup() is available */ /* #undef HAVE_KERNEL_TIMER_SETUP */ /* kernel_write() take loff_t pointer */ /* #undef HAVE_KERNEL_WRITE_PPOS */ /* kmem_cache_create_usercopy() exists */ /* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */ /* kstrtoul() exists */ /* #undef HAVE_KSTRTOUL */ /* ktime_get_coarse_real_ts64() exists */ /* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */ /* ktime_get_raw_ts64() exists */ /* #undef HAVE_KTIME_GET_RAW_TS64 */ /* kvmalloc exists */ /* #undef HAVE_KVMALLOC */ /* Define if you have [aio] */ /* #undef HAVE_LIBAIO */ /* Define if you have [blkid] */ /* #undef HAVE_LIBBLKID */ /* Define if you have [crypto] */ #define HAVE_LIBCRYPTO 1 /* Define if you have [tirpc] */ /* #undef HAVE_LIBTIRPC */ /* Define if you have [udev] */ /* #undef HAVE_LIBUDEV */ /* Define if you have [uuid] */ /* #undef HAVE_LIBUUID */ /* linux/blk-cgroup.h exists */ /* 
#undef HAVE_LINUX_BLK_CGROUP_HEADER */ /* lseek_execute() is available */ /* #undef HAVE_LSEEK_EXECUTE */ /* makedev() is declared in sys/mkdev.h */ /* #undef HAVE_MAKEDEV_IN_MKDEV */ /* makedev() is declared in sys/sysmacros.h */ /* #undef HAVE_MAKEDEV_IN_SYSMACROS */ /* Noting that make_request_fn() returns blk_qc_t */ /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ /* Noting that make_request_fn() returns void */ /* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */ /* iops->mkdir() takes umode_t */ /* #undef HAVE_MKDIR_UMODE_T */ /* Define to 1 if you have the `mlockall' function. */ #define HAVE_MLOCKALL 1 /* lookup_bdev() wants mode arg */ /* #undef HAVE_MODE_LOOKUP_BDEV */ /* Define if host toolchain supports MOVBE */ #define HAVE_MOVBE 1 /* new_sync_read()/new_sync_write() are available */ /* #undef HAVE_NEW_SYNC_READ */ /* folio_wait_bit() exists */ /* #undef HAVE_PAGEMAP_FOLIO_WAIT_BIT */ /* part_to_dev() exists */ /* #undef HAVE_PART_TO_DEV */ /* iops->getattr() takes a path */ /* #undef HAVE_PATH_IOPS_GETATTR */ /* Define if host toolchain supports PCLMULQDQ */ #define HAVE_PCLMULQDQ 1 /* percpu_counter_add_batch() is defined */ /* #undef HAVE_PERCPU_COUNTER_ADD_BATCH */ /* percpu_counter_init() wants gfp_t */ /* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */ /* posix_acl_chmod() exists */ /* #undef HAVE_POSIX_ACL_CHMOD */ /* posix_acl_from_xattr() needs user_ns */ /* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */ /* posix_acl_release() is available */ /* #undef HAVE_POSIX_ACL_RELEASE */ /* posix_acl_release() is GPL-only */ /* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */ /* posix_acl_valid() wants user namespace */ /* #undef HAVE_POSIX_ACL_VALID_WITH_NS */ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ /* iops->put_link() cookie */ /* #undef HAVE_PUT_LINK_COOKIE */ /* iops->put_link() delayed */ /* #undef HAVE_PUT_LINK_DELAYED */ /* iops->put_link() nameidata */ /* #undef HAVE_PUT_LINK_NAMEIDATA */ /* If available, contains the Python version number currently in use. */ #define HAVE_PYTHON "3.7" /* qat is enabled and existed */ /* #undef HAVE_QAT */ /* struct reclaim_state has reclaimed */ /* #undef HAVE_RECLAIM_STATE_RECLAIMED */ /* register_shrinker is vararg */ /* #undef HAVE_REGISTER_SHRINKER_VARARG */ /* register_sysctl_table exists */ /* #undef HAVE_REGISTER_SYSCTL_TABLE */ /* iops->rename2() exists */ /* #undef HAVE_RENAME2 */ /* struct inode_operations_wrapper takes .rename2() */ /* #undef HAVE_RENAME2_OPERATIONS_WRAPPER */ /* iops->rename() wants flags */ /* #undef HAVE_RENAME_WANTS_FLAGS */ /* REQ_DISCARD is defined */ /* #undef HAVE_REQ_DISCARD */ /* REQ_FLUSH is defined */ /* #undef HAVE_REQ_FLUSH */ /* REQ_OP_DISCARD is defined */ /* #undef HAVE_REQ_OP_DISCARD */ /* REQ_OP_FLUSH is defined */ /* #undef HAVE_REQ_OP_FLUSH */ /* REQ_OP_SECURE_ERASE is defined */ /* #undef HAVE_REQ_OP_SECURE_ERASE */ /* REQ_PREFLUSH is defined */ /* #undef HAVE_REQ_PREFLUSH */ /* revalidate_disk() is available */ /* #undef HAVE_REVALIDATE_DISK */ /* revalidate_disk_size() is available */ /* #undef HAVE_REVALIDATE_DISK_SIZE */ /* struct rw_semaphore has member activity */ /* #undef HAVE_RWSEM_ACTIVITY */ /* struct rw_semaphore has atomic_long_t member count */ /* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */ /* linux/sched/signal.h exists */ /* #undef HAVE_SCHED_SIGNAL_HEADER */ /* Define to 1 if you have the header file. 
*/ #define HAVE_SECURITY_PAM_MODULES_H 1 /* setattr_prepare() accepts mnt_idmap */ /* #undef HAVE_SETATTR_PREPARE_IDMAP */ /* setattr_prepare() is available, doesn't accept user_namespace */ /* #undef HAVE_SETATTR_PREPARE_NO_USERNS */ /* setattr_prepare() accepts user_namespace */ /* #undef HAVE_SETATTR_PREPARE_USERNS */ /* iops->set_acl() exists, takes 3 args */ /* #undef HAVE_SET_ACL */ /* iops->set_acl() takes 4 args, arg1 is struct mnt_idmap * */ /* #undef HAVE_SET_ACL_IDMAP_DENTRY */ /* iops->set_acl() takes 4 args */ /* #undef HAVE_SET_ACL_USERNS */ /* iops->set_acl() takes 4 args, arg2 is struct dentry * */ /* #undef HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ /* set_cached_acl() is usable */ /* #undef HAVE_SET_CACHED_ACL_USABLE */ /* set_special_state() exists */ /* #undef HAVE_SET_SPECIAL_STATE */ /* struct shrink_control exists */ /* #undef HAVE_SHRINK_CONTROL_STRUCT */ /* kernel_siginfo_t exists */ /* #undef HAVE_SIGINFO */ /* signal_stop() exists */ /* #undef HAVE_SIGNAL_STOP */ /* new shrinker callback wants 2 args */ /* #undef HAVE_SINGLE_SHRINKER_CALLBACK */ /* cs->count_objects exists */ /* #undef HAVE_SPLIT_SHRINKER_CALLBACK */ #if defined(__amd64__) || defined(__i386__) /* Define if host toolchain supports SSE */ #define HAVE_SSE 1 /* Define if host toolchain supports SSE2 */ #define HAVE_SSE2 1 /* Define if host toolchain supports SSE3 */ #define HAVE_SSE3 1 /* Define if host toolchain supports SSE4.1 */ #define HAVE_SSE4_1 1 /* Define if host toolchain supports SSE4.2 */ #define HAVE_SSE4_2 1 /* Define if host toolchain supports SSSE3 */ #define HAVE_SSSE3 1 #endif /* STACK_FRAME_NON_STANDARD is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD */ /* standalone exists */ /* #undef HAVE_STANDALONE_LINUX_STDARG */ /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDIO_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRING_H 1 /* Define to 1 if you have the `strlcat' function. */ #define HAVE_STRLCAT 1 /* Define to 1 if you have the `strlcpy' function. */ #define HAVE_STRLCPY 1 /* submit_bio is member of struct block_device_operations */ /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* super_setup_bdi_name() exits */ /* #undef HAVE_SUPER_SETUP_BDI_NAME */ /* super_block->s_user_ns exists */ /* #undef HAVE_SUPER_USER_NS */ /* sync_blockdev() is declared in include/blkdev.h */ /* #undef HAVE_SYNC_BLOCKDEV */ /* struct kobj_type has default_groups */ /* #undef HAVE_SYSFS_DEFAULT_GROUPS */ /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* i_op->tmpfile() exists */ /* #undef HAVE_TMPFILE */ /* i_op->tmpfile() uses old dentry signature */ /* #undef HAVE_TMPFILE_DENTRY */ /* i_op->tmpfile() has mnt_idmap */ /* #undef HAVE_TMPFILE_IDMAP */ /* i_op->tmpfile() has userns */ /* #undef HAVE_TMPFILE_USERNS */ /* totalhigh_pages() exists */ /* #undef HAVE_TOTALHIGH_PAGES */ /* kernel has totalram_pages() */ /* #undef HAVE_TOTALRAM_PAGES_FUNC */ /* Define to 1 if you have the `udev_device_get_is_initialized' function. */ /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */ /* kernel has __kernel_fpu_* functions */ /* #undef HAVE_UNDERSCORE_KERNEL_FPU */ /* Define to 1 if you have the header file. 
*/ #define HAVE_UNISTD_H 1 /* iops->getattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_GETATTR */ /* iops->setattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_SETATTR */ /* user_namespace->ns.inum exists */ /* #undef HAVE_USER_NS_COMMON_INUM */ /* iops->getattr() takes a vfsmount */ /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */ /* fops->clone_file_range() is available */ /* #undef HAVE_VFS_CLONE_FILE_RANGE */ /* fops->copy_file_range() is available */ /* #undef HAVE_VFS_COPY_FILE_RANGE */ /* fops->dedupe_file_range() is available */ /* #undef HAVE_VFS_DEDUPE_FILE_RANGE */ /* aops->direct_IO() uses iovec */ /* #undef HAVE_VFS_DIRECT_IO_IOVEC */ /* aops->direct_IO() uses iov_iter without rw */ /* #undef HAVE_VFS_DIRECT_IO_ITER */ /* aops->direct_IO() uses iov_iter with offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */ /* aops->direct_IO() uses iov_iter with rw and offset */ /* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */ /* filemap_dirty_folio exists */ /* #undef HAVE_VFS_FILEMAP_DIRTY_FOLIO */ /* file_operations_extend takes .copy_file_range() and .clone_file_range() */ /* #undef HAVE_VFS_FILE_OPERATIONS_EXTEND */ /* generic_copy_file_range() is available */ /* #undef HAVE_VFS_GENERIC_COPY_FILE_RANGE */ /* All required iov_iter interfaces are available */ /* #undef HAVE_VFS_IOV_ITER */ /* fops->iterate() is available */ /* #undef HAVE_VFS_ITERATE */ /* fops->iterate_shared() is available */ /* #undef HAVE_VFS_ITERATE_SHARED */ /* fops->readdir() is available */ /* #undef HAVE_VFS_READDIR */ /* address_space_operations->readpages exists */ /* #undef HAVE_VFS_READPAGES */ /* read_folio exists */ /* #undef HAVE_VFS_READ_FOLIO */ /* fops->remap_file_range() is available */ /* #undef HAVE_VFS_REMAP_FILE_RANGE */ /* fops->read/write_iter() are available */ /* #undef HAVE_VFS_RW_ITERATE */ /* __set_page_dirty_nobuffers exists */ /* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */ /* __vmalloc page flags exists */ /* #undef HAVE_VMALLOC_PAGE_KERNEL */ /* yes */ /* #undef HAVE_WAIT_ON_BIT_ACTION */ /* wait_queue_entry_t exists */ /* #undef HAVE_WAIT_QUEUE_ENTRY_T */ /* wq_head->head and wq_entry->entry exist */ /* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */ /* int (*writepage_t)() takes struct folio* */ /* #undef HAVE_WRITEPAGE_T_FOLIO */ /* xattr_handler->get() wants dentry */ /* #undef HAVE_XATTR_GET_DENTRY */ /* xattr_handler->get() wants both dentry and inode */ /* #undef HAVE_XATTR_GET_DENTRY_INODE */ /* xattr_handler->get() wants dentry and inode and flags */ /* #undef HAVE_XATTR_GET_DENTRY_INODE_FLAGS */ /* xattr_handler->get() wants xattr_handler */ /* #undef HAVE_XATTR_GET_HANDLER */ /* xattr_handler has name */ /* #undef HAVE_XATTR_HANDLER_NAME */ /* xattr_handler->list() wants dentry */ /* #undef HAVE_XATTR_LIST_DENTRY */ /* xattr_handler->list() wants xattr_handler */ /* #undef HAVE_XATTR_LIST_HANDLER */ /* xattr_handler->list() wants simple */ /* #undef HAVE_XATTR_LIST_SIMPLE */ /* xattr_handler->set() wants dentry */ /* #undef HAVE_XATTR_SET_DENTRY */ /* xattr_handler->set() wants both dentry and inode */ /* #undef HAVE_XATTR_SET_DENTRY_INODE */ /* xattr_handler->set() wants xattr_handler */ /* #undef HAVE_XATTR_SET_HANDLER */ /* xattr_handler->set() takes mnt_idmap */ /* #undef HAVE_XATTR_SET_IDMAP */ /* xattr_handler->set() takes user_namespace */ /* #undef HAVE_XATTR_SET_USERNS */ /* Define if host toolchain supports XSAVE */ #define HAVE_XSAVE 1 /* Define if host toolchain supports XSAVEOPT */ #define HAVE_XSAVEOPT 1 /* Define if host toolchain supports XSAVES 
*/ #define HAVE_XSAVES 1 /* ZERO_PAGE() is GPL-only */ /* #undef HAVE_ZERO_PAGE_GPL_ONLY */ /* Define if you have [z] */ #define HAVE_ZLIB 1 /* __posix_acl_chmod() exists */ /* #undef HAVE___POSIX_ACL_CHMOD */ /* kernel exports FPU functions */ /* #undef KERNEL_EXPORTS_X86_FPU */ /* TBD: fetch(3) support */ #if 0 /* whether the chosen libfetch is to be loaded at run-time */ #define LIBFETCH_DYNAMIC 1 /* libfetch is fetch(3) */ #define LIBFETCH_IS_FETCH 1 /* libfetch is libcurl */ #define LIBFETCH_IS_LIBCURL 0 /* soname of chosen libfetch */ #define LIBFETCH_SONAME "libfetch.so.6" #endif /* Define to the sub-directory where libtool stores uninstalled libraries. */ #define LT_OBJDIR ".libs/" /* make_request_fn() return type */ /* #undef MAKE_REQUEST_FN_RET */ /* struct shrink_control has nid */ /* #undef SHRINK_CONTROL_HAS_NID */ /* using complete_and_exit() instead */ /* #undef SPL_KTHREAD_COMPLETE_AND_EXIT */ /* Defined for legacy compatibility. */ #define SPL_META_ALIAS ZFS_META_ALIAS /* Defined for legacy compatibility. */ #define SPL_META_RELEASE ZFS_META_RELEASE /* Defined for legacy compatibility. */ #define SPL_META_VERSION ZFS_META_VERSION /* pde_data() is PDE_DATA() */ /* #undef SPL_PDE_DATA */ /* Define to 1 if all of the C90 standard headers exist (not just the ones required in a freestanding environment). This macro is provided for backward compatibility; new code need not use it. */ #define SYSTEM_FREEBSD 1 /* True if ZFS is to be compiled for a Linux system */ /* #undef SYSTEM_LINUX */ /* Version number of package */ /* #undef ZFS_DEBUG */ /* /dev/zfs minor */ /* #undef ZFS_DEVICE_MINOR */ /* enum node_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */ /* enum node_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum node_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */ /* enum zone_stat_item contains NR_FILE_PAGES */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */ /* enum zone_stat_item contains NR_INACTIVE_ANON */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */ /* enum zone_stat_item contains NR_INACTIVE_FILE */ /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */ /* GENHD_FL_EXT_DEVT flag is not available */ /* #undef ZFS_GENHD_FL_EXT_DEVT */ /* GENHD_FL_NO_PART_SCAN flag is available */ /* #undef ZFS_GENHD_FL_NO_PART */ /* global_node_page_state() exists */ /* #undef ZFS_GLOBAL_NODE_PAGE_STATE */ /* global_zone_page_state() exists */ /* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */ /* Define to 1 if GPL-only symbols can be used */ /* #undef ZFS_IS_GPL_COMPATIBLE */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.2.99-231-FreeBSD_g688514e47" +#define ZFS_META_ALIAS "zfs-2.2.99-239-FreeBSD_ga03ebd9be" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" /* Define the project release date. */ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ #define ZFS_META_KVER_MAX "6.6" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "3.10" /* Define the project license. */ #define ZFS_META_LICENSE "CDDL" /* Define the libtool library 'age' version information. */ /* #undef ZFS_META_LT_AGE */ /* Define the libtool library 'current' version information. */ /* #undef ZFS_META_LT_CURRENT */ /* Define the libtool library 'revision' version information. */ /* #undef ZFS_META_LT_REVISION */ /* Define the project name. 
*/
#define ZFS_META_NAME "zfs"

/* Define the project release. */
-#define ZFS_META_RELEASE "231-FreeBSD_g688514e47"
+#define ZFS_META_RELEASE "239-FreeBSD_ga03ebd9be"

/* Define the project version. */
#define ZFS_META_VERSION "2.2.99"

/* count is located in percpu_ref.data */
/* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
index 3dc0d7e7eaa5..12c5321568cd 100644
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define ZFS_META_GITREV "zfs-2.2.99-231-g688514e47"
+#define ZFS_META_GITREV "zfs-2.2.99-239-ga03ebd9be"
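A minimal usage sketch for the test-suite helpers imported above, assuming the standard ZFS test-suite environment (libtest.shlib sourced, $TESTPOOL/$TESTFS created by the usual setup, and a non-redundant or mirrored pool as list_file_blocks requires); the file name, sizes, and the final status check are illustrative assumptions, not part of the imported code:

	typeset mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
	log_must dd if=/dev/urandom of=$mntpnt/victim bs=128k count=4
	sync_all_pools

	# Overwrite every L0 (data) block of the file directly on the
	# vdevs, then scrub so the damage is detected and reported.
	corrupt_blocks_at_level $mntpnt/victim 0
	log_must zpool scrub -w $TESTPOOL
	log_must eval "zpool status -v $TESTPOOL | grep -q 'Permanent errors'"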