diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 7871eacc2e18..7eed5f48b989 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -1,1150 +1,1150 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2025, Klara, Inc.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
 /*
  * This file describes the interface that the DMU provides for its
  * consumers.
  *
  * The DMU also interacts with the SPA.  That interface is described in
  * dmu_spa.h.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/inttypes.h>
 #include <sys/cred.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_compress.h>
 #include <sys/uio.h>
 #include <sys/zfs_file.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct page;
 struct vnode;
 struct spa;
 struct zilog;
 struct zio;
 struct blkptr;
 struct zap_cursor;
 struct dsl_dataset;
 struct dsl_pool;
 struct dnode;
 struct drr_begin;
 struct drr_end;
 struct zbookmark_phys;
 struct spa;
 struct nvlist;
 struct arc_buf;
 struct zio_prop;
 struct sa_handle;
 struct dsl_crypto_params;
 struct locked_range;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
 typedef struct dnode dnode_t;
 
 /*
  * Byteswap selectors for DMU object data.  DMU_OT() stores one of these
  * values in the DMU_OT_BYTESWAP_MASK bits of an object type, and
  * DMU_OT_IS_VALID() accepts only values below DMU_BSWAP_NUMFUNCS, which is
  * therefore a count rather than a usable selector.
  */
 typedef enum dmu_object_byteswap {
 	DMU_BSWAP_UINT8,
 	DMU_BSWAP_UINT16,
 	DMU_BSWAP_UINT32,
 	DMU_BSWAP_UINT64,
 	DMU_BSWAP_ZAP,
 	DMU_BSWAP_DNODE,
 	DMU_BSWAP_OBJSET,
 	DMU_BSWAP_ZNODE,
 	DMU_BSWAP_OLDACL,
 	DMU_BSWAP_ACL,
 	/*
 	 * Allocating a new byteswap type number makes the on-disk format
 	 * incompatible with any other format that uses the same number.
 	 *
 	 * Data can usually be structured to work with one of the
 	 * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
 	 */
 	DMU_BSWAP_NUMFUNCS
 } dmu_object_byteswap_t;
 
 /*
  * Bit layout of an object type built with DMU_OT(): DMU_OT_NEWTYPE marks
  * the value as such a type, DMU_OT_METADATA and DMU_OT_ENCRYPTED are
  * single-bit flags, and the bits under DMU_OT_BYTESWAP_MASK carry the
  * byteswap selector.
  */
 #define	DMU_OT_NEWTYPE 0x80
 #define	DMU_OT_METADATA 0x40
 #define	DMU_OT_ENCRYPTED 0x20
 #define	DMU_OT_BYTESWAP_MASK 0x1f
 
 /*
  * Defines a uint8_t object type. Object types specify if the data
  * in the object is metadata (boolean), whether the encrypted flag is
  * set (boolean), and how to byteswap the data (dmu_object_byteswap_t;
  * only the bits under DMU_OT_BYTESWAP_MASK are kept). All of the types
  * created by this method are cached in the dbuf metadata cache.
  */
 #define	DMU_OT(byteswap, metadata, encrypted) \
 	(DMU_OT_NEWTYPE | \
 	((metadata) ? DMU_OT_METADATA : 0) | \
 	((encrypted) ? DMU_OT_ENCRYPTED : 0) | \
 	((byteswap) & DMU_OT_BYTESWAP_MASK))
 
 /*
  * Nonzero when ot is a recognized object type.  A type with DMU_OT_NEWTYPE
  * set is valid when its byteswap field is below DMU_BSWAP_NUMFUNCS; any
  * other type is valid when it is below DMU_OT_NUMTYPES.
  */
 #define	DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
 	(ot) < DMU_OT_NUMTYPES)
 
 /*
  * Whether ot is cached in the dbuf metadata cache: always B_TRUE for types
  * with DMU_OT_NEWTYPE set, otherwise the ot_dbuf_metadata_cache field of
  * the type's dmu_ot[] entry.
  */
 #define	DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
 
 /*
  * MDB doesn't have dmu_ot; it defines these macros itself.
  */
 #ifndef ZFS_MDB
 #define	DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata)
 #define	DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt)
 #define	DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap)
 #endif
 
 /*
  * Nonzero when ot denotes metadata.  Types with DMU_OT_NEWTYPE set are
  * answered from their DMU_OT_METADATA bit; all other types are answered
  * by DMU_OT_IS_METADATA_IMPL().
  */
 #define	DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	(((ot) & DMU_OT_METADATA) != 0) : \
 	DMU_OT_IS_METADATA_IMPL(ot))
 
 /* Nonzero only for DMU_OT_DDT_ZAP. */
 #define	DMU_OT_IS_DDT(ot) \
 	((ot) == DMU_OT_DDT_ZAP)
 
 /*
  * Nonzero when ot is metadata (per DMU_OT_IS_METADATA()) other than
  * DMU_OT_DIRECTORY_CONTENTS and DMU_OT_SA.  In the two-argument form,
  * DMU_OT_DNODE is excluded only when level is 0; with level > 0 it is
  * treated like any other metadata type.
  * NOTE(review): level is presumably the block's indirection level -
  * confirm against the callers.
  */
-#define	DMU_OT_IS_CRITICAL(ot) \
+#define	DMU_OT_IS_CRITICAL(ot, level) \
 	(DMU_OT_IS_METADATA(ot) && \
-	(ot) != DMU_OT_DNODE && \
+	((ot) != DMU_OT_DNODE || (level) > 0) && \
 	(ot) != DMU_OT_DIRECTORY_CONTENTS && \
 	(ot) != DMU_OT_SA)
 
 /* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */
 #define	DMU_OT_IS_FILE(ot) \
 	((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER)
 
 /*
  * Nonzero when ot carries the encrypted attribute.  Types with
  * DMU_OT_NEWTYPE set are answered from their DMU_OT_ENCRYPTED bit; all
  * other types are answered by DMU_OT_IS_ENCRYPTED_IMPL().
  */
 #define	DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	(((ot) & DMU_OT_ENCRYPTED) != 0) : \
 	DMU_OT_IS_ENCRYPTED_IMPL(ot))
 
 /*
  * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
  * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
  * is repurposed for embedded BPs.
  */
 #define	DMU_OT_HAS_FILL(ot) \
 	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
 
 /*
  * Yields the byteswap selector for ot: the DMU_OT_BYTESWAP_MASK bits for
  * types with DMU_OT_NEWTYPE set, otherwise DMU_OT_BYTESWAP_IMPL().
  */
 #define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) : \
 	DMU_OT_BYTESWAP_IMPL(ot))
 
 /*
  * DMU object types.  The enumerators from DMU_OT_NONE up to, but not
  * including, DMU_OT_NUMTYPES are numbered by their position in this list,
  * so their order must not change.  The DMU_OTN_* names that follow are
  * DMU_OT() combinations of a byteswap selector with the metadata and
  * encrypted flags.
  * NOTE(review): the trailing comment on each entry (ZAP, UINT64, ...)
  * presumably names the entry's byteswap class - confirm against dmu_ot[].
  */
 typedef enum dmu_object_type {
 	DMU_OT_NONE,
 	/* general: */
 	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
 	DMU_OT_BPOBJ,			/* UINT64 */
 	DMU_OT_BPOBJ_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
 	/* zil: */
 	DMU_OT_INTENT_LOG,		/* UINT64 */
 	/* dmu: */
 	DMU_OT_DNODE,			/* DNODE */
 	DMU_OT_OBJSET,			/* OBJSET */
 	/* dsl: */
 	DMU_OT_DSL_DIR,			/* UINT64 */
 	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
 	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
 	DMU_OT_DSL_PROPS,		/* ZAP */
 	DMU_OT_DSL_DATASET,		/* UINT64 */
 	/* zpl: */
 	DMU_OT_ZNODE,			/* ZNODE */
 	DMU_OT_OLDACL,			/* Old ACL */
 	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
 	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
 	DMU_OT_MASTER_NODE,		/* ZAP */
 	DMU_OT_UNLINKED_SET,		/* ZAP */
 	/* zvol: */
 	DMU_OT_ZVOL,			/* UINT8 */
 	DMU_OT_ZVOL_PROP,		/* ZAP */
 	/* other; for testing only! */
 	DMU_OT_PLAIN_OTHER,		/* UINT8 */
 	DMU_OT_UINT64_OTHER,		/* UINT64 */
 	DMU_OT_ZAP_OTHER,		/* ZAP */
 	/* new object types: */
 	DMU_OT_ERROR_LOG,		/* ZAP */
 	DMU_OT_SPA_HISTORY,		/* UINT8 */
 	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
 	DMU_OT_POOL_PROPS,		/* ZAP */
 	DMU_OT_DSL_PERMS,		/* ZAP */
 	DMU_OT_ACL,			/* ACL */
 	DMU_OT_SYSACL,			/* SYSACL */
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
 	DMU_OT_SCAN_QUEUE,		/* ZAP */
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
 	DMU_OT_DDT_ZAP,			/* ZAP */
 	DMU_OT_DDT_STATS,		/* ZAP */
 	DMU_OT_SA,			/* System attr */
 	DMU_OT_SA_MASTER_NODE,		/* ZAP */
 	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
 	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
 	DMU_OT_SCAN_XLATE,		/* ZAP */
 	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
 	DMU_OT_DEADLIST,		/* ZAP */
 	DMU_OT_DEADLIST_HDR,		/* UINT64 */
 	DMU_OT_DSL_CLONES,		/* ZAP */
 	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
 	/*
 	 * Do not allocate new object types here. Doing so makes the on-disk
 	 * format incompatible with any other format that uses the same object
 	 * type number.
 	 *
 	 * When creating an object which does not have one of the above types
 	 * use the DMU_OTN_* type with the correct byteswap and metadata
 	 * values.
 	 *
 	 * The DMU_OTN_* types do not have entries in the dmu_ot table,
 	 * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
 	 * of indexing into dmu_ot directly (this works for both DMU_OT_* types
 	 * and DMU_OTN_* types).
 	 */
 	DMU_OT_NUMTYPES,
 
 	/*
 	 * Names for valid types declared with DMU_OT().
 	 */
 	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE),
 	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE),
 	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE),
 	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE),
 	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE),
 	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE),
 	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE),
 	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE),
 	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE),
 	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE),
 
 	DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE),
 	DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE),
 	DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE),
 	DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE),
 	DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE),
 	DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE),
 	DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE),
 	DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE),
 	DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE),
 	DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE),
 } dmu_object_type_t;
 
 /*
  * These flags are for the dmu_tx_assign() function and describe what to do if
  * the transaction is full. See the comment above dmu_tx_assign() for more
  * details on the meaning of these flags.
  */
 #define	DMU_TX_NOWAIT		(0ULL)
 #define	DMU_TX_WAIT		(1ULL<<0)
 #define	DMU_TX_NOTHROTTLE	(1ULL<<1)
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
 void byteswap_uint16_array(void *buf, size_t size);
 void byteswap_uint8_array(void *buf, size_t size);
 void zap_byteswap(void *buf, size_t size);
 void zfs_oldacl_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);
 
 /*
  * Single-bit flags; each occupies a distinct bit, so they can be OR'ed
  * together.
  * NOTE(review): presumably the values accepted by the flags argument of
  * dmu_objset_find() - confirm against its implementation.
  */
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
 #define	DS_FIND_SERIALIZE	(1<<2)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
 #define	DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */
 #define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
 #define	DMU_PROJECTUSED_OBJECT	(-3ULL)
 
 /*
  * Zap prefix for object accounting in DMU_{USER,GROUP,PROJECT}USED_OBJECT.
  */
 #define	DMU_OBJACCT_PREFIX	"obj-"
 #define	DMU_OBJACCT_PREFIX_LEN	4
 
 /*
  * artificial blkids for bonus buffer and spill blocks
  */
 #define	DMU_BONUS_BLKID		(-1ULL)
 #define	DMU_SPILL_BLKID		(-2ULL)
 
 /*
  * Public routines to create, destroy, open, and close objsets.
  */
 typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg,
     cred_t *cr, dmu_tx_t *tx);
 
 int dmu_objset_hold(const char *name, const void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, boolean_t key_required, const void *tag,
     objset_t **osp);
 void dmu_objset_rele(objset_t *os, const void *tag);
 void dmu_objset_disown(objset_t *os, boolean_t key_required, const void *tag);
 int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
 
 void dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func,
     void *arg);
 int dmu_objset_clone(const char *name, const char *origin);
 int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
     struct nvlist *errlist);
 int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
 int dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 int dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 
 /*
  * Consumer-visible description of a DMU buffer: the object it belongs to,
  * the byte range of that object it covers, and a pointer to its data.
  */
 typedef struct dmu_buf {
 	uint64_t db_object;		/* object that this buffer is part of */
 	uint64_t db_offset;		/* byte offset in this object */
 	uint64_t db_size;		/* size of buffer in bytes */
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
 #define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
 #define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
 #define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
 #define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
 #define	DMU_POOL_DEFLATE		"deflate"
 #define	DMU_POOL_HISTORY		"history"
 #define	DMU_POOL_PROPS			"pool_props"
 #define	DMU_POOL_L2CACHE		"l2cache"
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
 #define	DMU_POOL_DDT_LOG		"DDT-log-%s-%u"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_DDT_DIR		"DDT-%s"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
 #define	DMU_POOL_SCAN			"scan"
 #define	DMU_POOL_ERRORSCRUB		"error_scrub"
 #define	DMU_POOL_LAST_SCRUBBED_TXG	"last_scrubbed_txg"
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
 #define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
 #define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
 #define	DMU_POOL_VDEV_ZAP_MAP		"com.delphix:vdev_zap_map"
 #define	DMU_POOL_REMOVING		"com.delphix:removing"
 #define	DMU_POOL_OBSOLETE_BPOBJ		"com.delphix:obsolete_bpobj"
 #define	DMU_POOL_CONDENSING_INDIRECT	"com.delphix:condensing_indirect"
 #define	DMU_POOL_ZPOOL_CHECKPOINT	"com.delphix:zpool_checkpoint"
 #define	DMU_POOL_LOG_SPACEMAP_ZAP	"com.delphix:log_spacemap_zap"
 #define	DMU_POOL_DELETED_CLONES		"com.delphix:deleted_clones"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
  * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
  *
  * The transaction must be assigned to a txg.  The newly allocated
  * object will be "held" in the transaction (ie. you can modify the
  * newly allocated object in this transaction).
  *
  * dmu_object_alloc() chooses an object and returns it in *objectp.
  *
  * dmu_object_claim() allocates a specific object number.  If that
  * number is already allocated, it fails and returns EEXIST.
  *
  * Return 0 on success, or ENOSPC or EEXIST as specified above.
  */
 uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
     int indirect_blockshift,
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
 uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len,
     int dnodesize, dmu_tx_t *tx);
 uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot,
     int blocksize, int indirect_blockshift, dmu_object_type_t bonustype,
     int bonuslen, int dnodesize, dnode_t **allocated_dnode, const void *tag,
     dmu_tx_t *tx);
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len,
     int dnodesize, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
 int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
     dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
     int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx);
 int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Free an object from this objset.
  *
  * The object's data will be freed as well (ie. you don't need to call
  * dmu_free(object, 0, -1, tx)).
  *
  * The object need not be held in the transaction.
  *
  * If there are any holds on this object's buffers (via dmu_buf_hold()),
  * or tx holds on the object (via dmu_tx_hold_object()), you can not
  * free it; it fails and returns EBUSY.
  *
  * If the object is not allocated, it fails and returns ENOENT.
  *
  * Return 0 on success, or EBUSY or ENOENT as specified above.
  */
 int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Find the next allocated or free object.
  *
  * The objectp parameter is in-out.  It will be updated to be the next
  * object which is allocated.  Ignore objects which have not been
  * modified since txg.
  *
  * XXX Can only be called on an objset with no dirty data.
  *
  * Returns 0 on success, or ENOENT if there are no more objects.
  */
 int dmu_object_next(objset_t *os, uint64_t *objectp,
     boolean_t hole, uint64_t txg);
 
 /*
  * Set the number of levels on a dnode. nlevels must be greater than the
  * current number of levels or an EINVAL will be returned.
  */
 int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels,
     dmu_tx_t *tx);
 
 /*
  * Set the data blocksize for an object.
  *
  * The object cannot have any blocks allocated beyond the first.  If
  * the first block is allocated already, the new size must be greater
  * than the current block size.  If these conditions are not met,
  * ENOTSUP will be returned.
  *
  * Returns 0 on success, or EBUSY if there are any holds on the object
  * contents, or ENOTSUP as described above.
  */
 int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
     int ibs, dmu_tx_t *tx);
 
 /*
  * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly
  * to accommodate the change. When calling this function, the caller must
  * ensure that the object's nlevels can sufficiently support the new maxblkid.
  */
 int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
     dmu_tx_t *tx);
 
 /*
  * Set the checksum property on a dnode.  The new checksum algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx);
 
 /*
  * Set the compress property on a dnode.  The new compression algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
 /*
  * Get an estimated cache size for an object. Caller must expect races.
  */
 int dmu_object_cached_size(objset_t *os, uint64_t object,
     uint64_t *l1sz, uint64_t *l2sz);
 
 void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx);
 void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx);
 
 /*
  * Decide how to write a block: checksum, compression, number of copies, etc.
  */
 #define	WP_NOFILL	0x1
 #define	WP_DMU_SYNC	0x2
 #define	WP_SPILL	0x4
 #define	WP_DIRECT_WR	0x8
 
 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
     struct zio_prop *zp);
 
 /*
  * DB_RF_* are to be used for dbuf_read() or in limited other cases.
  *
  * Several names share a value.  DB_RF_MUST_SUCCEED and DMU_READ_PREFETCH
  * are both 0, so they cannot be tested for with a bitwise AND.
  * DB_RF_NOPREFETCH and DB_RF_NO_DECRYPT are aliases of
  * DMU_READ_NO_PREFETCH and DMU_READ_NO_DECRYPT respectively.
  */
 typedef enum dmu_flags {
 	DB_RF_MUST_SUCCEED	= 0,	  /* Suspend on I/O errors. */
 	DB_RF_CANFAIL		= 1 << 0, /* Return on I/O errors. */
 	DB_RF_HAVESTRUCT	= 1 << 1, /* dn_struct_rwlock is locked. */
 	DB_RF_NEVERWAIT		= 1 << 2,
 	DMU_READ_PREFETCH	= 0,	  /* Try speculative prefetch. */
 	DMU_READ_NO_PREFETCH	= 1 << 3, /* Don't prefetch speculatively. */
 	DB_RF_NOPREFETCH	= DMU_READ_NO_PREFETCH,
 	DMU_READ_NO_DECRYPT	= 1 << 4, /* Don't decrypt. */
 	DB_RF_NO_DECRYPT	= DMU_READ_NO_DECRYPT,
 	DMU_DIRECTIO		= 1 << 5, /* Bypass ARC. */
 	DMU_UNCACHEDIO		= 1 << 6, /* Reduce caching. */
 	DMU_PARTIAL_FIRST	= 1 << 7, /* First partial access. */
 	DMU_PARTIAL_MORE	= 1 << 8, /* Following partial access. */
 	DMU_KEEP_CACHING	= 1 << 9, /* Don't affect caching. */
 } dmu_flags_t;
 
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
  * data.  As with any normal buffer, you must call dmu_buf_will_dirty()
  * before modifying it, and the
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
  * buffer as well.  You must release what you hold with dmu_buf_rele().
  *
  * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag,
     dmu_buf_t **dbp);
 int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
     dmu_flags_t flags);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
 dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
 int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
 
 /*
  * Special spill buffer support used by "SA" framework
  */
 
 int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags,
     const void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags,
     const void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
  * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
  * that it will remain in memory.  You must release the hold with
  * dmu_buf_rele().  You must not access the dmu_buf_t after releasing
  * what you hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
  *
  * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
  * on the returned buffer before reading or writing the buffer's
  * db_data.  The comments for those routines describe what particular
  * operations are valid after calling them.
  *
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **, dmu_flags_t flags);
 int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp);
 int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp);
 int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, dmu_flags_t flags);
 int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
     uint64_t length, boolean_t read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp, dmu_flags_t flags);
 int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, const void *tag,
     dmu_buf_t **dbp);
 
 /*
  * Add a reference to a dmu buffer that has already been held via
  * dmu_buf_hold() in the current context.
  */
 void dmu_buf_add_ref(dmu_buf_t *db, const void *tag);
 
 /*
  * Attempt to add a reference to a dmu buffer that is in an unknown state,
  * using a pointer that may have been invalidated by eviction processing.
  * The request will succeed if the passed in dbuf still represents the
  * same os/object/blkid, is ineligible for eviction, and has at least
  * one hold by a user other than the syncer.
  */
 boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
     uint64_t blkid, const void *tag);
 
 void dmu_buf_rele(dmu_buf_t *db, const void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 uint64_t dmu_buf_user_refcount(dmu_buf_t *db);
 
 /*
  * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
  * range of an object.  A pointer to an array of dmu_buf_t*'s is
  * returned (in *dbpp).
  *
  * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
  * frees the array.  The hold on the array of buffers MUST be released
  * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     uint64_t length, boolean_t read, const void *tag,
     int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, const void *tag);
 
 typedef void dmu_buf_evict_func_t(void *user_ptr);
 
 /*
  * A DMU buffer user object may be associated with a dbuf for the
  * duration of its lifetime.  This allows the user of a dbuf (client)
  * to attach private data to a dbuf (e.g. in-core only data such as a
  * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
  * when that dbuf has been evicted.  Clients typically respond to the
  * eviction notification by freeing their private data, thus ensuring
  * the same lifetime for both dbuf and private data.
  *
  * The mapping from a dmu_buf_user_t to any client private data is the
  * client's responsibility.  All current consumers of the API with private
  * data embed a dmu_buf_user_t as the first member of the structure for
  * their private data.  This allows conversions between the two types
  * with a simple cast.  Since the DMU buf user API never needs access
  * to the private data, other strategies can be employed if necessary
  * or convenient for the client (e.g. using container_of() to do the
  * conversion for private data that cannot have the dmu_buf_user_t as
  * its first member).
  *
  * Eviction callbacks are executed without the dbuf mutex held or any
  * other type of mechanism to guarantee that the dbuf is still available.
  * For this reason, users must assume the dbuf has already been freed
  * and not reference the dbuf from the callback context.
  *
  * Users requesting "immediate eviction" are notified as soon as the dbuf
  * is only referenced by dirty records (dirties == holds).  Otherwise the
  * notification occurs after eviction processing for the dbuf begins.
  */
 typedef struct dmu_buf_user {
 	/*
 	 * Asynchronous user eviction callback state.  Set up with
 	 * taskq_init_ent() by dmu_buf_init_user().
 	 */
 	taskq_ent_t	dbu_tqent;
 
 	/* Size of user data, for inclusion in dbuf_cache accounting. */
 	uint64_t	dbu_size;
 
 	/*
 	 * This instance's eviction function pointers.
 	 *
 	 * dbu_evict_func_sync is called synchronously and then
 	 * dbu_evict_func_async is executed asynchronously on a taskq.
 	 */
 	dmu_buf_evict_func_t *dbu_evict_func_sync;
 	dmu_buf_evict_func_t *dbu_evict_func_async;
 #ifdef ZFS_DEBUG
 	/*
 	 * Pointer to user's dbuf pointer.  NULL for clients that do
 	 * not associate a dbuf with their user data.
 	 *
 	 * The dbuf pointer is cleared upon eviction so as to catch
 	 * use-after-evict bugs in clients.
 	 *
 	 * This member exists only when ZFS_DEBUG is defined.
 	 */
 	dmu_buf_t **dbu_clear_on_evict_dbufp;
 #endif
 } dmu_buf_user_t;
 
 /*
  * Initialize the given dmu_buf_user_t instance with the eviction function
  * evict_func, to be called when the user is evicted.
  *
  * NOTE: This function should only be called once on a given dmu_buf_user_t.
  *       To allow enforcement of this, dbu must already be zeroed on entry.
  */
 static inline void
 dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
     dmu_buf_evict_func_t *evict_func_async,
     dmu_buf_t **clear_on_evict_dbufp __maybe_unused)
 {
 	/* Both callbacks must still be NULL, i.e. dbu was not set up before. */
 	ASSERT(dbu->dbu_evict_func_sync == NULL);
 	ASSERT(dbu->dbu_evict_func_async == NULL);
 
 	/* must have at least one evict func */
 	IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
 	dbu->dbu_evict_func_sync = evict_func_sync;
 	dbu->dbu_evict_func_async = evict_func_async;
 	taskq_init_ent(&dbu->dbu_tqent);
 #ifdef ZFS_DEBUG
 	/*
 	 * The destination member exists only under ZFS_DEBUG, which is why
 	 * the parameter is marked __maybe_unused.
 	 */
 	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
 #endif
 }
 
 /*
  * Attach user data to a dbuf and mark it for normal (when the dbuf's
  * data is cleared or its reference count goes to zero) eviction processing.
  *
  * Returns NULL on success, or the existing user if another user currently
  * owns the buffer.
  */
 void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Attach user data to a dbuf and mark it for immediate (its dirty and
  * reference counts are equal) eviction processing.
  *
  * Returns NULL on success, or the existing user if another user currently
  * owns the buffer.
  */
 void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Replace the current user of a dbuf.
  *
  * If given the current user of a dbuf, replaces the dbuf's user with
  * "new_user" and returns the user data pointer that was replaced.
  * Otherwise returns the current, and unmodified, dbuf user pointer.
  */
 void *dmu_buf_replace_user(dmu_buf_t *db,
     dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
 
 /*
  * Remove the specified user data for a DMU buffer.
  *
  * Returns the user that was removed on success, or the current user if
  * another user currently owns the buffer.
  */
 void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * User data size accounting. This can be used to artificially inflate the size
  * of the dbuf during cache accounting, so that dbuf_evict_thread evicts enough
  * to satisfy memory reclaim requests. It's not used for anything else, and
  * defaults to 0.
  */
 uint64_t dmu_buf_user_size(dmu_buf_t *db);
 void dmu_buf_add_user_size(dmu_buf_t *db, uint64_t nadd);
 void dmu_buf_sub_user_size(dmu_buf_t *db, uint64_t nsub);
 
 /*
  * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 objset_t *dmu_buf_get_objset(dmu_buf_t *db);
 
 /* Block until any in-progress dmu buf user evictions complete. */
 void dmu_buf_user_evict_wait(void);
 
 /*
  * Returns the blkptr associated with this dbuf, or NULL if not set.
  */
 struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
 
 /*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called
  * dmu_tx_assign()).  The buffer's object must be held in the tx
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_dirty_flags(dmu_buf_t *db, dmu_tx_t *tx, dmu_flags_t flags);
 boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
 
 /*
  * You must create a transaction, then hold the objects which you will
  * (or might) modify as part of this transaction.  Then you must assign
  * the transaction to a transaction group.  Once the transaction has
  * been assigned, you can modify buffers which belong to held objects as
  * part of this transaction.  You can't modify buffers before the
  * transaction has been assigned; you can't modify buffers which don't
  * belong to objects which this transaction holds; you can't hold
  * objects once the transaction has been assigned.  You may hold an
  * object which you are going to free (with dmu_object_free()), but you
  * don't have to.
  *
  * You can abort the transaction before it has been assigned.
  *
  * Note that you may hold buffers (with dmu_buf_hold) at any time,
  * regardless of transaction state.
  */
 
 #define	DMU_NEW_OBJECT	(-1ULL)
 #define	DMU_OBJECT_END	(-1ULL)
 
 dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     int len);
 void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     int len);
 void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
     const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
 void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, uint64_t flags);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_mark_netfree(dmu_tx_t *tx);
 
 /*
  * To register a commit callback, dmu_tx_callback_register() must be called.
  *
  * dcb_data is a pointer to caller private data that is passed on as a
  * callback parameter. The caller is responsible for properly allocating and
  * freeing it.
  *
  * When registering a callback, the transaction must be already created, but
  * it cannot be committed or aborted. It can be assigned to a txg or not.
  *
  * The callback will be called after the transaction has been safely written
  * to stable storage and will also be called if the dmu_tx is aborted.
  * If there is any error which prevents the transaction from being committed to
  * disk, the callback will be called with a value of error != 0.
  *
  * When multiple callbacks are registered to the transaction, the callbacks
  * will be called in reverse order to let Lustre, the only user of commit
  * callback currently, take the fast path of its commit callback handling.
  */
 typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
 
 void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
     void *dcb_data);
 void dmu_tx_do_callbacks(list_t *cb_list, int error);
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
  * -1, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx);
 int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size);
 int dmu_free_long_object(objset_t *os, uint64_t object);
 
 /*
  * Convenience functions.
  *
  * Routines that can fail return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     void *buf, dmu_flags_t flags);
 int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     dmu_flags_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx);
 int dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx, dmu_flags_t flags);
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx);
 #ifdef _KERNEL
 int dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
     dmu_flags_t flags);
 int dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
     dmu_flags_t flags);
 int dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
     dmu_flags_t flags);
 int dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
 	dmu_tx_t *tx, dmu_flags_t flags);
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
 	dmu_tx_t *tx, dmu_flags_t flags);
 int dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
 	dmu_tx_t *tx, dmu_flags_t flags);
 #endif
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset,
     struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
 int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset,
     struct arc_buf *buf, dmu_tx_t *tx, dmu_flags_t flags);
 #define	dmu_assign_arcbuf	dmu_assign_arcbuf_by_dbuf
 extern uint_t zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
  */
 void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 	uint64_t len, enum zio_priority pri);
 void dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
 	uint64_t len, enum zio_priority pri);
 void dmu_prefetch_dnode(objset_t *os, uint64_t object, enum zio_priority pri);
 int dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size);
 
 /*
  * Description of a single DMU object, filled in through the doi argument
  * of dmu_object_info() and its _from_dnode()/_from_db() variants.
  */
 typedef struct dmu_object_info {
	/* All sizes are in bytes unless otherwise indicated. */
	uint32_t doi_data_block_size;
	uint32_t doi_metadata_block_size;
	dmu_object_type_t doi_type;
	dmu_object_type_t doi_bonus_type;
	uint64_t doi_bonus_size;
	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
	uint8_t doi_checksum;
	uint8_t doi_compress;
	uint8_t doi_nblkptr;
	/* NOTE(review): presumably alignment padding -- confirm */
	uint8_t doi_pad[4];
	uint64_t doi_dnodesize;
	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
	uint64_t doi_max_offset;
	uint64_t doi_fill_count;		/* number of non-empty blocks */
 } dmu_object_info_t;
 
 typedef void (*const arc_byteswap_func_t)(void *buf, size_t size);
 
 /*
  * Static attributes of one DMU object type.  The dmu_ot[] table declared
  * below holds DMU_OT_NUMTYPES of these.
  */
 typedef struct dmu_object_type_info {
	/* NOTE(review): presumably selects a dmu_ot_byteswap[] entry -- confirm */
	dmu_object_byteswap_t	ot_byteswap;
	boolean_t		ot_metadata;
	boolean_t		ot_dbuf_metadata_cache;
	boolean_t		ot_encrypt;
	const char		*ot_name;
 } dmu_object_type_info_t;
 
 /*
  * Pairs a byteswap routine with its name.  The typedef is itself
  * const-qualified, so every dmu_object_byteswap_info_t object (including
  * the dmu_ot_byteswap[] table declared below) is read-only.
  */
 typedef const struct dmu_object_byteswap_info {
	arc_byteswap_func_t	 ob_func;
	const char		*ob_name;
 } dmu_object_byteswap_info_t;
 
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
 extern dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
 
 /*
  * Get information on a DMU object.
  *
  * Return 0 on success or ENOENT if object is not allocated.
  *
  * If doi is NULL, just indicates whether the object exists.
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dnode in hand. */
 void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 /*
  * Like dmu_object_info_from_db, but faster still when you only care about
  * the size.
  */
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
 
 void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize);
 
 /*
  * Dataset statistics returned through the stat argument of
  * dmu_objset_fast_stat().
  */
 typedef struct dmu_objset_stats {
	uint64_t dds_num_clones; /* number of clones of this */
	uint64_t dds_creation_txg;
	uint64_t dds_guid;
	dmu_objset_type_t dds_type;
	/* NOTE(review): the three uint8_t fields look like 0/1 flags -- confirm */
	uint8_t dds_is_snapshot;
	uint8_t dds_inconsistent;
	uint8_t dds_redacted;
	/* Buffer sized by ZFS_MAX_DATASET_NAME_LEN. */
	char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
 } dmu_objset_stats_t;
 
 /*
  * Get stats on a dataset.
  */
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 
 /*
  * Add entries to the nvlist for all the objset's properties.  See
  * zfs_prop_table[] and zfsprops(7) for details on the properties.
  */
 void dmu_objset_stats(objset_t *os, struct nvlist *nv);
 
 /*
  * Get the space usage statistics for statvfs().
  *
  * refdbytes is the amount of space "referenced" by this objset.
  * availbytes is the amount of space available to this objset, taking
  * into account quotas & reservations, assuming that no other objsets
  * use the space first.  These values correspond to the 'referenced' and
  * 'available' properties, described in the zfsprops(7) manpage.
  *
  * usedobjs and availobjs are the number of objects currently allocated,
  * and available.
  */
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 
 /*
  * The fsid_guid is a 56-bit ID that can change to avoid collisions.
  * (Contrast with the ds_guid which is a 64-bit ID that will never
  * change, so there is a small probability that it will collide.)
  */
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 
 /*
  * Get the [cm]time for an objset's snapshot dir
  */
 inode_timespec_t dmu_objset_snap_cmtime(objset_t *os);
 
 int dmu_objset_is_snapshot(objset_t *os);
 
 extern struct spa *dmu_objset_spa(objset_t *os);
 extern struct zilog *dmu_objset_zil(objset_t *os);
 extern struct dsl_pool *dmu_objset_pool(objset_t *os);
 extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern uint64_t dmu_objset_dnodesize(objset_t *os);
 extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
 extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
 extern int dmu_objset_blksize(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val);
 extern int dmu_snapshot_realname(objset_t *os, const char *name, char *real,
     int maxlen, boolean_t *conflict);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
 /*
  * Per-file identifiers handed to a file_info_cb_t callback through its
  * zoi argument.
  * NOTE(review): presumably the callback fills these in from the bonus
  * data it is given -- confirm against the registered callbacks.
  */
 typedef struct zfs_file_info {
	uint64_t zfi_user;
	uint64_t zfi_group;
	uint64_t zfi_project;
	uint64_t zfi_generation;
 } zfs_file_info_t;
 
 typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data,
     struct zfs_file_info *zoi);
 extern void dmu_objset_register_type(dmu_objset_type_t ost,
     file_info_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
 extern void *dmu_objset_get_user(objset_t *os);
 
 /*
  * Return the txg number for the given assigned transaction.
  */
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 
 /*
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
  * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
 
 /*
  * {zfs,zvol,ztest}_get_done() args
  *
  * Also the argument type taken by dmu_sync() and by its dmu_sync_cb_t
  * completion callback (both declared below).
  */
 typedef struct zgd {
	struct lwb	*zgd_lwb;
	struct blkptr	*zgd_bp;
	dmu_buf_t	*zgd_db;
	struct zfs_locked_range *zgd_lr;
	void		*zgd_private;
 } zgd_t;
 
 typedef void dmu_sync_cb_t(zgd_t *arg, int error);
 int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
 
 /*
  * Find the next hole or data block in the file, starting at *off.
  * Return the found offset in *off.  Return ESRCH for end of file.
  */
 int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
     uint64_t *off);
 
 int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, struct blkptr *bps, size_t *nbpsp);
 int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps);
 
 /*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
 extern void dmu_fini(void);
 
 typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
     uint64_t object, uint64_t offset, int len);
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 
 int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
     zfs_file_t *fp, offset_t *offp);
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
 extern uint_t dmu_prefetch_max;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_DMU_H */
diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7
index 42ce1fea62b8..b7b1047a8a7b 100644
--- a/man/man7/zfsprops.7
+++ b/man/man7/zfsprops.7
@@ -1,2279 +1,2280 @@
 .\" SPDX-License-Identifier: CDDL-1.0
 .\"
 .\" CDDL HEADER START
 .\"
 .\" The contents of this file are subject to the terms of the
 .\" Common Development and Distribution License (the "License").
 .\" You may not use this file except in compliance with the License.
 .\"
 .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 .\" or https://opensource.org/licenses/CDDL-1.0.
 .\" See the License for the specific language governing permissions
 .\" and limitations under the License.
 .\"
 .\" When distributing Covered Code, include this CDDL HEADER in each
 .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 .\" If applicable, add the following below this CDDL HEADER, with the
 .\" fields enclosed by brackets "[]" replaced with your own identifying
 .\" information: Portions Copyright [yyyy] [name of copyright owner]
 .\"
 .\" CDDL HEADER END
 .\"
 .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
 .\" Copyright 2011 Joshua M. Clulow <josh@sysmgr.org>
 .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 .\" Copyright (c) 2011, Pawel Jakub Dawidek <pjd@FreeBSD.org>
 .\" Copyright (c) 2012, Glen Barber <gjb@FreeBSD.org>
 .\" Copyright (c) 2012, Bryan Drewery <bdrewery@FreeBSD.org>
 .\" Copyright (c) 2013, Steven Hartland <smh@FreeBSD.org>
 .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 .\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
 .\" Copyright (c) 2014 by Adam Stevko. All rights reserved.
 .\" Copyright (c) 2014 Integros [integros.com]
 .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved.
 .\" Copyright (c) 2014, Xin LI <delphij@FreeBSD.org>
 .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved.
 .\" Copyright 2019 Richard Laager. All rights reserved.
 .\" Copyright 2018 Nexenta Systems, Inc.
 .\" Copyright 2019 Joyent, Inc.
 .\" Copyright (c) 2019, Kjeld Schouten-Lebbing
 .\" Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
 .\"
 .Dd June 29, 2024
 .Dt ZFSPROPS 7
 .Os
 .
 .Sh NAME
 .Nm zfsprops
 .Nd native and user-defined properties of ZFS datasets
 .
 .Sh DESCRIPTION
 Properties are divided into two types, native properties and user-defined
 .Po or
 .Qq user
 .Pc
 properties.
 Native properties either export internal statistics or control ZFS behavior.
 In addition, native properties are either editable or read-only.
 User properties have no effect on ZFS behavior, but you can use them to annotate
 datasets in a way that is meaningful in your environment.
 For more information about user properties, see the
 .Sx User Properties
 section, below.
 .
 .Ss Native Properties
 Every dataset has a set of properties that export statistics about the dataset
 as well as control various behaviors.
 Properties are inherited from the parent unless overridden by the child.
 Some properties apply only to certain types of datasets
 .Pq file systems, volumes, or snapshots .
 .Pp
 The values of numeric properties can be specified using human-readable suffixes
 .Po for example,
 .Sy k ,
 .Sy KB ,
 .Sy M ,
 .Sy Gb ,
 and so forth, up to
 .Sy Z
 for zettabyte
 .Pc .
 The following are all valid
 .Pq and equal
 specifications:
 .Li 1536M ,
 .Li 1.5g ,
 .Li 1.50GB .
 .Pp
 The values of non-numeric properties are case sensitive and must be lowercase,
 except for
 .Sy mountpoint ,
 .Sy sharenfs ,
 and
 .Sy sharesmb .
 .Pp
 The following native properties consist of read-only statistics about the
 dataset.
 These properties can be neither set, nor inherited.
 Native properties apply to all dataset types unless otherwise noted.
 .Bl -tag -width "usedbyrefreservation"
 .It Sy available
 The amount of space available to the dataset and all its children, assuming that
 there is no other activity in the pool.
 Because space is shared within a pool, availability can be limited by any number
 of factors, including physical pool size, quotas, reservations, or other
 datasets within the pool.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy avail .
 .It Sy compressratio
 For non-snapshots, the compression ratio achieved for the
 .Sy used
 space of this dataset, expressed as a multiplier.
 The
 .Sy used
 property includes descendant datasets, and, for clones, does not include the
 space shared with the origin snapshot.
 For snapshots, the
 .Sy compressratio
 is the same as the
 .Sy refcompressratio
 property.
 Compression can be turned on by running:
 .Nm zfs Cm set Sy compression Ns = Ns Sy on Ar dataset .
 The default value is
 .Sy off .
 .It Sy createtxg
 The transaction group (txg) in which the dataset was created.
 Bookmarks have the same
 .Sy createtxg
 as the snapshot they are initially tied to.
 This property is suitable for ordering a list of snapshots,
 e.g. for incremental send and receive.
 .It Sy creation
 The time this dataset was created.
 .It Sy clones
 For snapshots, this property is a comma-separated list of filesystems or volumes
 which are clones of this snapshot.
 The clones'
 .Sy origin
 property is this snapshot.
 If the
 .Sy clones
 property is not empty, then this snapshot cannot be destroyed
 .Po even with the
 .Fl r
 or
 .Fl f
 options
 .Pc .
 The roles of origin and clone can be swapped by promoting the clone with the
 .Nm zfs Cm promote
 command.
 .It Sy defer_destroy
 This property is
 .Sy on
 if the snapshot has been marked for deferred destroy by using the
 .Nm zfs Cm destroy Fl d
 command.
 Otherwise, the property is
 .Sy off .
 .It Sy encryptionroot
 For encrypted datasets, indicates where the dataset is currently inheriting its
 encryption key from.
 Loading or unloading a key for the
 .Sy encryptionroot
 will implicitly load / unload the key for any inheriting datasets (see
 .Nm zfs Cm load-key
 and
 .Nm zfs Cm unload-key
 for details).
 Clones will always share an
 encryption key with their origin.
 See the
 .Sx Encryption
 section of
 .Xr zfs-load-key 8
 for details.
 .It Sy filesystem_count
 The total number of filesystems and volumes that exist under this location in
 the dataset tree.
 This value is only available when a
 .Sy filesystem_limit
 has been set somewhere in the tree under which the dataset resides.
 .It Sy keystatus
 Indicates if an encryption key is currently loaded into ZFS.
 The possible values are
 .Sy none ,
 .Sy available ,
 and
 .Sy unavailable .
 See
 .Nm zfs Cm load-key
 and
 .Nm zfs Cm unload-key .
 .It Sy guid
 The 64-bit GUID of this dataset or bookmark which does not change over its
 entire lifetime.
 When a snapshot is sent to another pool, the received snapshot has the same
 GUID.
 Thus, the
 .Sy guid
 is suitable to identify a snapshot across pools.
 .It Sy logicalreferenced
 The amount of space that is
 .Qq logically
 accessible by this dataset.
 See the
 .Sy referenced
 property.
 The logical space ignores the effect of the
 .Sy compression
 and
 .Sy copies
 properties, giving a quantity closer to the amount of data that applications
 see.
 However, it does include space consumed by metadata.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy lrefer .
 .It Sy logicalused
 The amount of space that is
 .Qq logically
 consumed by this dataset and all its descendants.
 See the
 .Sy used
 property.
 The logical space ignores the effect of the
 .Sy compression
 and
 .Sy copies
 properties, giving a quantity closer to the amount of data that applications
 see.
 However, it does include space consumed by metadata.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy lused .
 .It Sy mounted
 For file systems, indicates whether the file system is currently mounted.
 This property can be either
 .Sy yes
 or
 .Sy no .
 .It Sy objsetid
 A unique identifier for this dataset within the pool.
 Unlike the dataset's
 .Sy guid , No the Sy objsetid
 of a dataset is not transferred to other pools when the snapshot is copied
 with a send/receive operation.
 The
 .Sy objsetid
 can be reused (for a new dataset) after the dataset is deleted.
 .It Sy origin
 For cloned file systems or volumes, the snapshot from which the clone was
 created.
 See also the
 .Sy clones
 property.
 .It Sy receive_resume_token
 For filesystems or volumes which have saved partially-completed state from
 .Nm zfs Cm receive Fl s ,
 this opaque token can be provided to
 .Nm zfs Cm send Fl t
 to resume and complete the
 .Nm zfs Cm receive .
 .It Sy redact_snaps
 For bookmarks, this is the list of snapshot GUIDs the bookmark contains a
 redaction
 list for.
 For snapshots, this is the list of snapshot GUIDs the snapshot is redacted with
 respect to.
 .It Sy referenced
 The amount of data that is accessible by this dataset, which may or may not be
 shared with other datasets in the pool.
 When a snapshot or clone is created, it initially references the same amount of
 space as the file system or snapshot it was created from, since its contents are
 identical.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy refer .
 .It Sy refcompressratio
 The compression ratio achieved for the
 .Sy referenced
 space of this dataset, expressed as a multiplier.
 See also the
 .Sy compressratio
 property.
 .It Sy snapshot_count
 The total number of snapshots that exist under this location in the dataset
 tree.
 This value is only available when a
 .Sy snapshot_limit
 has been set somewhere in the tree under which the dataset resides.
 .It Sy type
 The type of dataset:
 .Sy filesystem ,
 .Sy volume ,
 .Sy snapshot ,
 or
 .Sy bookmark .
 .It Sy used
 The amount of space consumed by this dataset and all its descendants.
 This is the value that is checked against this dataset's quota and reservation.
 The space used does not include this dataset's reservation, but does take into
 account the reservations of any descendant datasets.
 The amount of space that a dataset consumes from its parent, as well as the
 amount of space that is freed if this dataset is recursively destroyed, is the
 greater of its space used and its reservation.
 .Pp
 The used space of a snapshot
 .Po see the
 .Sx Snapshots
 section of
 .Xr zfsconcepts 7
 .Pc
 is space that is referenced exclusively by this snapshot.
 If this snapshot is destroyed, the amount of
 .Sy used
 space will be freed.
 Space that is shared by multiple snapshots isn't accounted for in this metric.
 When a snapshot is destroyed, space that was previously shared with this
 snapshot can become unique to snapshots adjacent to it, thus changing the used
 space of those snapshots.
 The used space of the latest snapshot can also be affected by changes in the
 file system.
 Note that the
 .Sy used
 space of a snapshot is a subset of the
 .Sy written
 space of the snapshot.
 .Pp
 The amount of space used, available, or referenced does not take into account
 pending changes.
 Pending changes are generally accounted for within a few seconds.
 Committing a change to a disk using
 .Xr fsync 2
 or
 .Sy O_SYNC
 does not necessarily guarantee that the space usage information is updated
 immediately.
 .It Sy usedby*
 The
 .Sy usedby*
 properties decompose the
 .Sy used
 properties into the various reasons that space is used.
 Specifically,
 .Sy used No =
 .Sy usedbychildren No +
 .Sy usedbydataset No +
 .Sy usedbyrefreservation No +
 .Sy usedbysnapshots .
 These properties are only available for datasets created on
 .Nm zpool
 .Qo version 13 Qc
 pools.
 .It Sy usedbychildren
 The amount of space used by children of this dataset, which would be freed if
 all the dataset's children were destroyed.
 .It Sy usedbydataset
 The amount of space used by this dataset itself, which would be freed if the
 dataset were destroyed
 .Po after first removing any
 .Sy refreservation
 and destroying any necessary snapshots or descendants
 .Pc .
 .It Sy usedbyrefreservation
 The amount of space used by a
 .Sy refreservation
 set on this dataset, which would be freed if the
 .Sy refreservation
 was removed.
 .It Sy usedbysnapshots
 The amount of space consumed by snapshots of this dataset.
 In particular, it is the amount of space that would be freed if all of this
 dataset's snapshots were destroyed.
 Note that this is not simply the sum of the snapshots'
 .Sy used
 properties because space can be shared by multiple snapshots.
 .It Sy userused Ns @ Ns Ar user
 The amount of space consumed by the specified user in this dataset.
 Space is charged to the owner of each file, as displayed by
 .Nm ls Fl l .
 The amount of space charged is displayed by
 .Nm du No and Nm ls Fl s .
 See the
 .Nm zfs Cm userspace
 command for more information.
 .Pp
 Unprivileged users can access only their own space usage.
 The root user, or a user who has been granted the
 .Sy userused
 privilege with
 .Nm zfs Cm allow ,
 can access everyone's usage.
 .Pp
 The
 .Sy userused Ns @ Ns Ar …
 properties are not displayed by
 .Nm zfs Cm get Sy all .
 The user's name must be appended after the
 .Sy @
 symbol, using one of the following forms:
 .Bl -bullet -compact -offset 4n
 .It
 POSIX name
 .Pq Qq joe
 .It
 POSIX numeric ID
 .Pq Qq 789
 .It
 SID name
 .Pq Qq joe.smith@mydomain
 .It
 SID numeric ID
 .Pq Qq S-1-123-456-789
 .El
 .Pp
 Files created on Linux always have POSIX owners.
 .It Sy userobjused Ns @ Ns Ar user
 The
 .Sy userobjused
 property is similar to
 .Sy userused
 but instead it counts the number of objects consumed by a user.
 This property counts all objects allocated on behalf of the user,
 so it may differ from the results of system tools such as
 .Nm df Fl i .
 .Pp
 When the property
 .Sy xattr Ns = Ns Sy on
 is set on a file system additional objects will be created per-file to store
 extended attributes.
 These additional objects are reflected in the
 .Sy userobjused
 value and are counted against the user's
 .Sy userobjquota .
 When a file system is configured to use
 .Sy xattr Ns = Ns Sy sa
 no additional internal objects are normally required.
 .It Sy userrefs
 This property is set to the number of user holds on this snapshot.
 User holds are set by using the
 .Nm zfs Cm hold
 command.
 .It Sy groupused Ns @ Ns Ar group
 The amount of space consumed by the specified group in this dataset.
 Space is charged to the group of each file, as displayed by
 .Nm ls Fl l .
 See the
 .Sy userused Ns @ Ns Ar user
 property for more information.
 .Pp
 Unprivileged users can only access their own groups' space usage.
 The root user, or a user who has been granted the
 .Sy groupused
 privilege with
 .Nm zfs Cm allow ,
 can access all groups' usage.
 .It Sy groupobjused Ns @ Ns Ar group
 The number of objects consumed by the specified group in this dataset.
 Multiple objects may be charged to the group for each file when extended
 attributes are in use.
 See the
 .Sy userobjused Ns @ Ns Ar user
 property for more information.
 .Pp
 Unprivileged users can only access their own groups' object usage.
 The root user, or a user who has been granted the
 .Sy groupobjused
 privilege with
 .Nm zfs Cm allow ,
 can access all groups' usage.
 .It Sy projectused Ns @ Ns Ar project
 The amount of space consumed by the specified project in this dataset.
 A project is identified by its project identifier (ID), which is an
 object-based numeric attribute.
 An object can inherit the project ID from its parent object (if the
 parent has the inherit-project-ID flag, which can be set and changed via
 .Nm chattr Fl /+P
 or
 .Nm zfs project Fl s )
 when it is created.
 A privileged user can set and change an object's project
 ID via
 .Nm chattr Fl p
 or
 .Nm zfs project Fl s
 anytime.
 Space is charged to the project of each file, as displayed by
 .Nm lsattr Fl p
 or
 .Nm zfs project .
 See the
 .Sy userused Ns @ Ns Ar user
 property for more information.
 .Pp
 The root user, or a user who has been granted the
 .Sy projectused
 privilege with
 .Nm zfs allow ,
 can access all projects' usage.
 .It Sy projectobjused Ns @ Ns Ar project
 The
 .Sy projectobjused
 property is similar to
 .Sy projectused
 but instead it counts the number of objects consumed by a project.
 When the property
 .Sy xattr Ns = Ns Sy on
 is set on a file system, ZFS will create additional objects per-file to store
 extended attributes.
 These additional objects are reflected in the
 .Sy projectobjused
 value and are counted against the project's
 .Sy projectobjquota .
 When a filesystem is configured to use
 .Sy xattr Ns = Ns Sy sa
 no additional internal objects are required.
 See the
 .Sy userobjused Ns @ Ns Ar user
 property for more information.
 .Pp
 The root user, or a user who has been granted the
 .Sy projectobjused
 privilege with
 .Nm zfs allow ,
 can access all projects' objects usage.
 .It Sy snapshots_changed
 Provides a mechanism to quickly determine whether the snapshot list has
 changed without having to mount a dataset or iterate the snapshot list.
 Specifies the time at which a snapshot for a dataset was last
 created or deleted.
 .Pp
 This makes it possible to be more efficient about how often snapshots are queried.
 The property is persistent across mount and unmount operations only if the
 .Sy extensible_dataset
 feature is enabled.
 .It Sy volblocksize
 For volumes, specifies the block size of the volume.
 The
 .Sy blocksize
 cannot be changed once the volume has been written, so it should be set at
 volume creation time.
 The default
 .Sy blocksize
 for volumes is 16 KiB.
 Any power of 2 from 512 bytes to 128 KiB is valid.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy volblock .
 .It Sy written
 The amount of space
 .Sy referenced
 by this dataset, that was written since the previous snapshot
 .Pq i.e. that is not referenced by the previous snapshot .
 .It Sy written Ns @ Ns Ar snapshot
 The amount of
 .Sy referenced
 space written to this dataset since the specified snapshot.
 This is the space that is referenced by this dataset but was not referenced by
 the specified snapshot.
 .Pp
 The
 .Ar snapshot
 may be specified as a short snapshot name
 .Pq just the part after the Sy @ ,
 in which case it will be interpreted as a snapshot in the same filesystem as
 this dataset.
 The
 .Ar snapshot
 may be a full snapshot name
 .Pq Ar filesystem Ns @ Ns Ar snapshot ,
 which for clones may be a snapshot in the origin's filesystem
 .Pq or the origin of the origin's filesystem, etc.
 .El
 .Pp
 The following native properties can be used to change the behavior of a ZFS
 dataset.
 .Bl -tag -width ""
 .It Xo
 .Sy aclinherit Ns = Ns Sy discard Ns | Ns Sy noallow Ns | Ns
 .Sy restricted Ns | Ns Sy passthrough Ns | Ns Sy passthrough-x
 .Xc
 Controls how ACEs are inherited when files and directories are created.
 .Bl -tag -compact -offset 4n -width "passthrough-x"
 .It Sy discard
 does not inherit any ACEs.
 .It Sy noallow
 only inherits inheritable ACEs that specify
 .Qq deny
 permissions.
 .It Sy restricted
 default, removes the
 .Sy write_acl
 and
 .Sy write_owner
 permissions when the ACE is inherited.
 .It Sy passthrough
 inherits all inheritable ACEs without any modifications.
 .It Sy passthrough-x
 same meaning as
 .Sy passthrough ,
 except that the
 .Sy owner@ , group@ , No and Sy everyone@
 ACEs inherit the execute permission only if the file creation mode also requests
 the execute bit.
 .El
 .Pp
 When the property value is set to
 .Sy passthrough ,
 files are created with a mode determined by the inheritable ACEs.
 If no inheritable ACEs exist that affect the mode, then the mode is set in
 accordance with the requested mode from the application.
 .Pp
 The
 .Sy aclinherit
 property does not apply to POSIX ACLs.
 .It Xo
 .Sy aclmode Ns = Ns Sy discard Ns | Ns Sy groupmask Ns | Ns
 .Sy passthrough Ns | Ns Sy restricted Ns
 .Xc
 Controls how an ACL is modified during
 .Xr chmod 2
 and how inherited ACEs
 are modified by the file creation mode:
 .Bl -tag -compact -offset 4n -width "passthrough"
 .It Sy discard
 default, deletes all
 .Sy ACEs
 except for those representing
 the mode of the file or directory requested by
 .Xr chmod 2 .
 .It Sy groupmask
 reduces permissions granted in all
 .Sy ALLOW
 entries found in the
 .Sy ACL
 such that they are no greater than the group permissions specified by
 .Xr chmod 2 .
 .It Sy passthrough
 indicates that no changes are made to the ACL other than creating or updating
 the necessary ACL entries to represent the new mode of the file or directory.
 .It Sy restricted
 will cause the
 .Xr chmod 2
 operation to return an error when used on any file or directory which has
 a non-trivial ACL whose entries can not be represented by a mode.
 .Xr chmod 2
 is required to change the set user ID, set group ID, or sticky bits on a file
 or directory, as they do not have equivalent ACL entries.
 In order to use
 .Xr chmod 2
 on a file or directory with a non-trivial ACL when
 .Sy aclmode
 is set to
 .Sy restricted ,
 you must first remove all ACL entries which do not represent the current mode.
 .El
 .It Sy acltype Ns = Ns Sy off Ns | Ns Sy nfsv4 Ns | Ns Sy posix
 Controls whether ACLs are enabled and if so what type of ACL to use.
 When this property is set to a type of ACL not supported by the current
 platform, the behavior is the same as if it were set to
 .Sy off .
 .Bl -tag -compact -offset 4n -width "posixacl"
 .It Sy off
 default on Linux, when a file system has the
 .Sy acltype
 property set to off then ACLs are disabled.
 .It Sy noacl
 an alias for
 .Sy off
 .It Sy nfsv4
 default on
 .Fx ,
 indicates that NFSv4-style ZFS ACLs should be used.
 These ACLs can be managed with the
 .Xr getfacl 1
 and
 .Xr setfacl 1 .
 The
 .Sy nfsv4
 ZFS ACL type is not yet supported on Linux.
 .It Sy posix
 indicates POSIX ACLs should be used.
 POSIX ACLs are specific to Linux and are not functional on other platforms.
 POSIX ACLs are stored as an extended
 attribute and therefore will not overwrite any existing NFSv4 ACLs which
 may be set.
 .It Sy posixacl
 an alias for
 .Sy posix
 .El
 .Pp
 To obtain the best performance when setting
 .Sy posix
 users are strongly encouraged to set the
 .Sy xattr Ns = Ns Sy sa
 property.
 This will result in the POSIX ACL being stored more efficiently on disk.
 But as a consequence, all new extended attributes will only be
 accessible from OpenZFS implementations which support the
 .Sy xattr Ns = Ns Sy sa
 property.
 See the
 .Sy xattr
 property for more details.
 .It Sy atime Ns = Ns Sy on Ns | Ns Sy off
 Controls whether the access time for files is updated when they are read.
 Turning this property off avoids producing write traffic when reading files and
 can result in significant performance gains, though it might confuse mailers
 and other similar utilities.
 The values
 .Sy on
 and
 .Sy off
 are equivalent to the
 .Sy atime
 and
 .Sy noatime
 mount options.
 The default value is
 .Sy on .
 See also
 .Sy relatime
 below.
 .It Sy canmount Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy noauto
 If this property is set to
 .Sy off ,
 the file system cannot be mounted, and is ignored by
 .Nm zfs Cm mount Fl a .
 Setting this property to
 .Sy off
 is similar to setting the
 .Sy mountpoint
 property to
 .Sy none ,
 except that the dataset still has a normal
 .Sy mountpoint
 property, which can be inherited.
 Setting this property to
 .Sy off
 allows datasets to be used solely as a mechanism to inherit properties.
 One example of setting
 .Sy canmount Ns = Ns Sy off
 is to have two datasets with the same
 .Sy mountpoint ,
 so that the children of both datasets appear in the same directory, but might
 have different inherited characteristics.
 .Pp
 When set to
 .Sy noauto ,
 a dataset can only be mounted and unmounted explicitly.
 The dataset is not mounted automatically when the dataset is created or
 imported, nor is it mounted by the
 .Nm zfs Cm mount Fl a
 command or unmounted by the
 .Nm zfs Cm unmount Fl a
 command.
 .Pp
 This property is not inherited.
 .It Xo
 .Sy checksum Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy fletcher2 Ns | Ns
 .Sy fletcher4 Ns | Ns Sy sha256 Ns | Ns Sy noparity Ns | Ns
 .Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr Ns | Ns Sy blake3
 .Xc
 Controls the checksum used to verify data integrity.
 The default value is
 .Sy on ,
 which automatically selects an appropriate algorithm
 .Po currently,
 .Sy fletcher4 ,
 but this may change in future releases
 .Pc .
 The value
 .Sy off
 disables integrity checking on user data.
 The value
 .Sy noparity
 not only disables integrity but also disables maintaining parity for user data.
 This setting is used internally by a dump device residing on a RAID-Z pool and
 should not be used by any other dataset.
 Disabling checksums is
 .Em NOT
 a recommended practice.
 .Pp
 The
 .Sy sha512 ,
 .Sy skein ,
 .Sy edonr ,
 and
 .Sy blake3
 checksum algorithms require enabling the appropriate features on the pool.
 .Pp
 Please see
 .Xr zpool-features 7
 for more information on these algorithms.
 .Pp
 Changing this property affects only newly-written data.
 .It Xo
 .Sy compression Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy gzip Ns | Ns
 .Sy gzip- Ns Ar N Ns | Ns Sy lz4 Ns | Ns Sy lzjb Ns | Ns Sy zle Ns | Ns Sy zstd Ns | Ns
 .Sy zstd- Ns Ar N Ns | Ns Sy zstd-fast Ns | Ns Sy zstd-fast- Ns Ar N
 .Xc
 Controls the compression algorithm used for this dataset.
 .Pp
 When set to
 .Sy on
 (the default), indicates that the current default compression algorithm should
 be used.
 The default balances compression and decompression speed with compression ratio
 and is expected to work well on a wide variety of workloads.
 Unlike all other settings for this property,
 .Sy on
 does not select a fixed compression type.
 As new compression algorithms are added to ZFS and enabled on a pool, the
 default compression algorithm may change.
 The current default compression algorithm is either
 .Sy lzjb
 or, if the
 .Sy lz4_compress
 feature is enabled,
 .Sy lz4 .
 .Pp
 The
 .Sy lz4
 compression algorithm is a high-performance replacement for the
 .Sy lzjb
 algorithm.
 It features significantly faster compression and decompression, as well as a
 moderately higher compression ratio than
 .Sy lzjb ,
 but can only be used on pools with the
 .Sy lz4_compress
 feature set to
 .Sy enabled .
 See
 .Xr zpool-features 7
 for details on ZFS feature flags and the
 .Sy lz4_compress
 feature.
 .Pp
 The
 .Sy lzjb
 compression algorithm is optimized for performance while providing decent data
 compression.
 .Pp
 The
 .Sy gzip
 compression algorithm uses the same compression as the
 .Xr gzip 1
 command.
 You can specify the
 .Sy gzip
 level by using the value
 .Sy gzip- Ns Ar N ,
 where
 .Ar N
 is an integer from 1
 .Pq fastest
 to 9
 .Pq best compression ratio .
 Currently,
 .Sy gzip
 is equivalent to
 .Sy gzip-6
 .Po which is also the default for
 .Xr gzip 1
 .Pc .
 .Pp
 The
 .Sy zstd
 compression algorithm provides both high compression ratios and good
 performance.
 You can specify the
 .Sy zstd
 level by using the value
 .Sy zstd- Ns Ar N ,
 where
 .Ar N
 is an integer from 1
 .Pq fastest
 to 19
 .Pq best compression ratio .
 .Sy zstd
 is equivalent to
 .Sy zstd-3 .
 .Pp
 Faster speeds at the cost of the compression ratio can be requested by
 setting a negative
 .Sy zstd
 level.
 This is done using
 .Sy zstd-fast- Ns Ar N ,
 where
 .Ar N
 is an integer in
 .Bq Sy 1 Ns - Ns Sy 10 , 20 , 30 , No … , Sy 100 , 500 , 1000
 which maps to a negative
 .Sy zstd
 level.
 The lower the level the faster the compression \(em
 .Sy 1000
 provides the fastest compression and lowest compression ratio.
 .Sy zstd-fast
 is equivalent to
 .Sy zstd-fast- Ns Ar 1 .
 .Pp
 The
 .Sy zle
 compression algorithm compresses runs of zeros.
 .Pp
 This property can also be referred to by its shortened column name
 .Sy compress .
 Changing this property affects only newly-written data.
 .Pp
 When any setting except
 .Sy off
 is selected, compression will explicitly check for blocks consisting of only
 zeroes (the NUL byte).
 When a zero-filled block is detected, it is stored as
 a hole and not compressed using the indicated compression algorithm.
 .Pp
 All blocks are allocated as a whole number of sectors
 .Pq chunks of 2^ Ns Sy ashift No bytes , e.g . Sy 512B No or Sy 4KB .
 Compression may result in a non-sector-aligned size, which will be rounded up
 to a whole number of sectors.
 If compression saves less than one whole sector,
 the block will be stored uncompressed.
 Therefore, blocks whose logical size is a small number of sectors will
 experience less compression
 (e.g. for
 .Sy recordsize Ns = Ns Sy 16K
 with
 .Sy 4K
 sectors, which have 4 sectors per block,
 compression needs to save at least 25% to actually save space on disk).
 .Pp
 There is a
 .Sy 12.5%
 default compression threshold in addition to sector rounding.
 .It Xo
 .Sy context Ns = Ns Sy none Ns | Ns
 .Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level
 .Xc
 This flag sets the SELinux context for all files in the file system under
 a mount point for that file system.
 See
 .Xr selinux 8
 for more information.
 .It Xo
 .Sy fscontext Ns = Ns Sy none Ns | Ns
 .Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level
 .Xc
 This flag sets the SELinux context for the file system being
 mounted.
 See
 .Xr selinux 8
 for more information.
 .It Xo
 .Sy defcontext Ns = Ns Sy none Ns | Ns
 .Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level
 .Xc
 This flag sets the SELinux default context for unlabeled files.
 See
 .Xr selinux 8
 for more information.
 .It Xo
 .Sy rootcontext Ns = Ns Sy none Ns | Ns
 .Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level
 .Xc
 This flag sets the SELinux context for the root inode of the file system.
 See
 .Xr selinux 8
 for more information.
 .It Sy copies Ns = Ns Sy 1 Ns | Ns Sy 2 Ns | Ns Sy 3
 Controls the number of copies of data stored for this dataset.
 These copies are in addition to any redundancy provided by the pool, for
 example, mirroring or RAID-Z.
 The copies are stored on different disks, if possible.
 The space used by multiple copies is charged to the associated file and dataset,
 changing the
 .Sy used
 property and counting against quotas and reservations.
 .Pp
 Changing this property only affects newly-written data.
 Therefore, set this property at file system creation time by using the
 .Fl o Sy copies Ns = Ns Ar N
 option.
 .Pp
 Remember that ZFS will not import a pool with a missing top-level vdev.
 Do
 .Em NOT
 create, for example, a two-disk striped pool and set
 .Sy copies Ns = Ns Ar 2
 on some datasets thinking you have set up redundancy for them.
 When a disk fails you will not be able to import the pool
 and will have lost all of your data.
 .Pp
 Encrypted datasets may not have
 .Sy copies Ns = Ns Ar 3
 since the implementation stores some encryption metadata where the third copy
 would normally be.
 .It Sy devices Ns = Ns Sy on Ns | Ns Sy off
 Controls whether device nodes can be opened on this file system.
 The default value is
 .Sy on .
 The values
 .Sy on
 and
 .Sy off
 are equivalent to the
 .Sy dev
 and
 .Sy nodev
 mount options.
 .It Xo
 .Sy dedup Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy verify Ns | Ns
 .Sy sha256 Ns Oo , Ns Sy verify Oc Ns | Ns Sy sha512 Ns Oo , Ns Sy verify Oc Ns | Ns Sy skein Ns Oo , Ns Sy verify Oc Ns | Ns
 .Sy edonr , Ns Sy verify Ns | Ns Sy blake3 Ns Oo , Ns Sy verify Oc Ns
 .Xc
 Configures deduplication for a dataset.
 The default value is
 .Sy off .
 The default deduplication checksum is
 .Sy sha256
 (this may change in the future).
 When
 .Sy dedup
 is enabled, the checksum defined here overrides the
 .Sy checksum
 property.
 Setting the value to
 .Sy verify
 has the same effect as the setting
 .Sy sha256 , Ns Sy verify .
 .Pp
 If set to
 .Sy verify ,
 ZFS will do a byte-for-byte comparison in case of two blocks having the same
 signature to make sure the block contents are identical.
 Specifying
 .Sy verify
 is mandatory for the
 .Sy edonr
 algorithm.
 .Pp
 Unless necessary, deduplication should
 .Em not
 be enabled on a system.
 See the
 .Sx Deduplication
 section of
 .Xr zfsconcepts 7 .
 .It Xo
 .Sy direct Ns = Ns Sy disabled Ns | Ns Sy standard Ns | Ns Sy always
 .Xc
 Controls the behavior of Direct I/O requests
 .Pq e.g. Dv O_DIRECT .
 The
 .Sy standard
 behavior for Direct I/O requests is to bypass the ARC when possible.
 These requests will not be cached and performance will be limited by the
 raw speed of the underlying disks
 .Pq this is the default .
 .Sy always
 causes every properly aligned read or write to be treated as a direct request.
 .Sy disabled
 causes the O_DIRECT flag to be silently ignored and all direct requests will
 be handled by the ARC.
 This is the default behavior for OpenZFS 2.2 and prior releases.
 .Pp
 Bypassing the ARC requires that a direct request be correctly aligned.
 For write requests the starting offset and size of the request must be
 .Sy recordsize Ns
 -aligned, if not then the unaligned portion of the request will be silently
 redirected through the ARC.
 For read requests there is no
 .Sy recordsize
 alignment restriction on either the starting offset or size.
 All direct requests must use a page-aligned memory buffer and the request
 size must be a multiple of the page size or an error is returned.
 .Pp
 Concurrently mixing buffered and direct requests to overlapping regions of
 a file can decrease performance.
 However, the resulting file will always be coherent.
 For example, a direct read after a buffered write will return the data
 from the buffered write.
 Furthermore, if an application uses
 .Xr mmap 2
 based file access then in order to maintain coherency all direct requests
 are converted to buffered requests while the file is mapped.
 Currently Direct I/O is not supported with zvols.
 If dedup is enabled on a dataset, Direct I/O writes will not check for
 deduplication.
 Deduplication and Direct I/O writes are currently incompatible.
 .It Xo
 .Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns
 .Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k
 .Xc
 Specifies a compatibility mode or literal value for the size of dnodes in the
 file system.
 The default value is
 .Sy legacy .
 Setting this property to a value other than
 .Sy legacy No requires the Sy large_dnode No pool feature to be enabled .
 .Pp
 Consider setting
 .Sy dnodesize
 to
 .Sy auto
 if the dataset uses the
 .Sy xattr Ns = Ns Sy sa
 property setting and the workload makes heavy use of extended attributes.
 This
 may be applicable to SELinux-enabled systems, Lustre servers, and Samba
 servers, for example.
 Literal values are supported for cases where the optimal
 size is known in advance and for performance testing.
 .Pp
 Leave
 .Sy dnodesize
 set to
 .Sy legacy
 if you need to receive a send stream of this dataset on a pool that doesn't
 enable the
 .Sy large_dnode
 feature, or if you need to import this pool on a system that doesn't support the
 .Sy large_dnode No feature .
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy dnsize .
 .It Xo
 .Sy encryption Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy aes-128-ccm Ns | Ns
 .Sy aes-192-ccm Ns | Ns Sy aes-256-ccm Ns | Ns Sy aes-128-gcm Ns | Ns
 .Sy aes-192-gcm Ns | Ns Sy aes-256-gcm
 .Xc
 Controls the encryption cipher suite (block cipher, key length, and mode) used
 for this dataset.
 Requires the
 .Sy encryption
 feature to be enabled on the pool.
 Requires a
 .Sy keyformat
 to be set at dataset creation time.
 .Pp
 Selecting
 .Sy encryption Ns = Ns Sy on
 when creating a dataset indicates that the default encryption suite will be
 selected, which is currently
 .Sy aes-256-gcm .
 In order to provide consistent data protection, encryption must be specified at
 dataset creation time and it cannot be changed afterwards.
 .Pp
 For more details and caveats about encryption see the
 .Sx Encryption
 section of
 .Xr zfs-load-key 8 .
 .It Sy keyformat Ns = Ns Sy raw Ns | Ns Sy hex Ns | Ns Sy passphrase
 Controls what format the user's encryption key will be provided as.
 This property is only set when the dataset is encrypted.
 .Pp
 Raw keys and hex keys must be 32 bytes long (regardless of the chosen
 encryption suite) and must be randomly generated.
 A raw key can be generated with the following command:
 .Dl # Nm dd Sy if=/dev/urandom bs=32 count=1 Sy of= Ns Pa /path/to/output/key
 .Pp
 Passphrases must be between 8 and 512 bytes long and will be processed through
 PBKDF2 before being used (see the
 .Sy pbkdf2iters
 property).
 Even though the encryption suite cannot be changed after dataset creation,
 the keyformat can be with
 .Nm zfs Cm change-key .
 .It Xo
 .Sy keylocation Ns = Ns Sy prompt Ns | Ns Sy file:// Ns Ar /absolute/file/path Ns | Ns Sy https:// Ns Ar address Ns | Ns Sy http:// Ns Ar address
 .Xc
 Controls where the user's encryption key will be loaded from by default for
 commands such as
 .Nm zfs Cm load-key
 and
 .Nm zfs Cm mount Fl l .
 This property is only set for encrypted datasets which are encryption roots.
 If unspecified, the default is
 .Sy prompt .
 .Pp
 Even though the encryption suite cannot be changed after dataset creation, the
 keylocation can be with either
 .Nm zfs Cm set
 or
 .Nm zfs Cm change-key .
 If
 .Sy prompt
 is selected ZFS will ask for the key at the command prompt when it is required
 to access the encrypted data (see
 .Nm zfs Cm load-key
 for details).
 This setting will also allow the key to be passed in via the standard input
 stream,
 but users should be careful not to place keys which should be kept secret on
 the command line.
 If a file URI is selected, the key will be loaded from the
 specified absolute file path.
 If an HTTPS or HTTP URL is selected, it will be GETted using
 .Xr fetch 3 ,
 libcurl, or nothing, depending on compile-time configuration and run-time
 availability.
 The
 .Sy SSL_CA_CERT_FILE
 environment variable can be set to set the location
 of the concatenated certificate store.
 The
 .Sy SSL_CA_CERT_PATH
 environment variable can be set to override the location
 of the directory containing the certificate authority bundle.
 The
 .Sy SSL_CLIENT_CERT_FILE
 and
 .Sy SSL_CLIENT_KEY_FILE
 environment variables can be set to configure the path
 to the client certificate and its key.
 .It Sy pbkdf2iters Ns = Ns Ar iterations
 Controls the number of PBKDF2 iterations that a
 .Sy passphrase
 encryption key should be run through when processing it into an encryption key.
 This property is only defined when encryption is enabled and a keyformat of
 .Sy passphrase
 is selected.
 The goal of PBKDF2 is to significantly increase the
 computational difficulty needed to brute force a user's passphrase.
 This is accomplished by forcing the attacker to run each passphrase through a
 computationally expensive hashing function many times before they arrive at the
 resulting key.
 A user who actually knows the passphrase will only have to pay this cost once.
 As CPUs become better at processing, this number should be
 raised to ensure that a brute force attack is still not possible.
 The current default is
 .Sy 350000
 and the minimum is
 .Sy 100000 .
 This property may be changed with
 .Nm zfs Cm change-key .
 .It Sy exec Ns = Ns Sy on Ns | Ns Sy off
 Controls whether processes can be executed from within this file system.
 The default value is
 .Sy on .
 The values
 .Sy on
 and
 .Sy off
 are equivalent to the
 .Sy exec
 and
 .Sy noexec
 mount options.
 .It Sy volthreading Ns = Ns Sy on Ns | Ns Sy off
 Controls internal zvol threading.
 The value
 .Sy off
 disables zvol threading, and zvol relies on application threads.
 The default value is
 .Sy on ,
 which enables threading within a zvol.
 Please note that this property will be overridden by
 .Sy zvol_request_sync
 module parameter.
 This property is only applicable to Linux.
 .It Sy filesystem_limit Ns = Ns Ar count Ns | Ns Sy none
 Limits the number of filesystems and volumes that can exist under this point in
 the dataset tree.
 The limit is not enforced if the user is allowed to change the limit.
 Setting a
 .Sy filesystem_limit
 on a descendant of a filesystem that already has a
 .Sy filesystem_limit
 does not override the ancestor's
 .Sy filesystem_limit ,
 but rather imposes an additional limit.
 This feature must be enabled to be used
 .Po see
 .Xr zpool-features 7
 .Pc .
 .It Sy special_small_blocks Ns = Ns Ar size
 This value represents the threshold block size for including small file
 blocks into the special allocation class.
 Blocks smaller than or equal to this
 value will be assigned to the special allocation class while greater blocks
 will be assigned to the regular class.
 Valid values are zero or a power of two from 512 up to 1048576 (1 MiB).
 The default size is 0 which means no small file blocks
 will be allocated in the special class.
 .Pp
 Before setting this property, a special class vdev must be added to the
 pool.
 See
 .Xr zpoolconcepts 7
 for more details on the special allocation class.
 .It Sy mountpoint Ns = Ns Pa path Ns | Ns Sy none Ns | Ns Sy legacy
 Controls the mount point used for this file system.
 See the
 .Sx Mount Points
 section of
 .Xr zfsconcepts 7
 for more information on how this property is used.
 .Pp
 When the
 .Sy mountpoint
 property is changed for a file system, the file system and any children that
 inherit the mount point are unmounted.
 If the new value is
 .Sy legacy ,
 then they remain unmounted.
 Otherwise, they are automatically remounted in the new location if the property
 was previously
 .Sy legacy
 or
 .Sy none .
 In addition, any shared file systems are unshared and shared in the new
 location.
 .Pp
 When the
 .Sy mountpoint
 property is set with
 .Nm zfs Cm set Fl u ,
 the
 .Sy mountpoint
 property is updated but the dataset is not mounted or unmounted and remains
 as it was before.
 .It Sy nbmand Ns = Ns Sy on Ns | Ns Sy off
 Controls whether the file system should be mounted with
 .Sy nbmand
 .Pq Non-blocking mandatory locks .
 Changes to this property only take effect when the file system is unmounted and
 remounted.
 This was only supported by Linux prior to 5.15, and was buggy there,
 and is not supported by
 .Fx .
 On Solaris it's used for SMB clients.
 .It Sy overlay Ns = Ns Sy on Ns | Ns Sy off
 Allow mounting on a busy directory or a directory which already contains
 files or directories.
 This is the default mount behavior for Linux and
 .Fx
 file systems.
 On these platforms the property is
 .Sy on
 by default.
 Set to
 .Sy off
 to disable overlay mounts for consistency with OpenZFS on other platforms.
 .It Sy primarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata
 Controls what is cached in the primary cache
 .Pq ARC .
 If this property is set to
 .Sy all ,
 then both user data and metadata are cached.
 If this property is set to
 .Sy none ,
 then neither user data nor metadata is cached.
 If this property is set to
 .Sy metadata ,
 then only metadata is cached.
 The default value is
 .Sy all .
 .It Sy quota Ns = Ns Ar size Ns | Ns Sy none
 Limits the amount of space a dataset and its descendants can consume.
 This property enforces a hard limit on the amount of space used.
 This includes all space consumed by descendants, including file systems and
 snapshots.
 Setting a quota on a descendant of a dataset that already has a quota does not
 override the ancestor's quota, but rather imposes an additional limit.
 .Pp
 Quotas cannot be set on volumes, as the
 .Sy volsize
 property acts as an implicit quota.
 .It Sy snapshot_limit Ns = Ns Ar count Ns | Ns Sy none
 Limits the number of snapshots that can be created on a dataset and its
 descendants.
 Setting a
 .Sy snapshot_limit
 on a descendant of a dataset that already has a
 .Sy snapshot_limit
 does not override the ancestor's
 .Sy snapshot_limit ,
 but rather imposes an additional limit.
 The limit is not enforced if the user is allowed to change the limit.
 For example, this means that recursive snapshots taken from the global zone are
 counted against each delegated dataset within a zone.
 This feature must be enabled to be used
 .Po see
 .Xr zpool-features 7
 .Pc .
 .It Sy userquota@ Ns Ar user Ns = Ns Ar size Ns | Ns Sy none
 Limits the amount of space consumed by the specified user.
 User space consumption is identified by the
 .Sy userspace@ Ns Ar user
 property.
 .Pp
 Enforcement of user quotas may be delayed by several seconds.
 This delay means that a user might exceed their quota before the system notices
 that they are over quota and begins to refuse additional writes with the
 .Er EDQUOT
 error message.
 See the
 .Nm zfs Cm userspace
 command for more information.
 .Pp
 Unprivileged users can only access their own groups' space usage.
 The root user, or a user who has been granted the
 .Sy userquota
 privilege with
 .Nm zfs Cm allow ,
 can get and set everyone's quota.
 .Pp
 This property is not available on volumes, on file systems before version 4, or
 on pools before version 15.
 The
 .Sy userquota@ Ns Ar …
 properties are not displayed by
 .Nm zfs Cm get Sy all .
 The user's name must be appended after the
 .Sy @
 symbol, using one of the following forms:
 .Bl -bullet -compact -offset 4n
 .It
 POSIX name
 .Pq Qq joe
 .It
 POSIX numeric ID
 .Pq Qq 789
 .It
 SID name
 .Pq Qq joe.smith@mydomain
 .It
 SID numeric ID
 .Pq Qq S-1-123-456-789
 .El
 .Pp
 Files created on Linux always have POSIX owners.
 .It Sy defaultuserquota Ns = Ns Ar size Ns | Ns Sy none
 Sets a default user quota to be applied to each user for whom no
 user-specific quota is set.
 The value
 .Sy 0
 disables defaultuserquota.
 .It Sy userobjquota@ Ns Ar user Ns = Ns Ar size Ns | Ns Sy none
 The
 .Sy userobjquota
 is similar to
 .Sy userquota
 but it limits the number of objects a user can create.
 Please refer to
 .Sy userobjused
 for more information about how objects are counted.
 .It Sy defaultuserobjquota Ns = Ns Ar size Ns | Ns Sy none
 Sets a default user object quota to be applied to each user for
 whom no userobj-specific quota is set.
 The value
 .Sy 0
 disables defaultuserobjquota.
 .It Sy groupquota@ Ns Ar group Ns = Ns Ar size Ns | Ns Sy none
 Limits the amount of space consumed by the specified group.
 Group space consumption is identified by the
 .Sy groupused@ Ns Ar group
 property.
 .Pp
 Unprivileged users can access only their own groups' space usage.
 The root user, or a user who has been granted the
 .Sy groupquota
 privilege with
 .Nm zfs Cm allow ,
 can get and set all groups' quotas.
 .It Sy defaultgroupquota Ns = Ns Ar size Ns | Ns Sy none
 Sets a default group quota to be applied to each group for whom no
 group-specific quota is set.
 The value
 .Sy 0
 disables defaultgroupquota.
 .It Sy groupobjquota@ Ns Ar group Ns = Ns Ar size Ns | Ns Sy none
 The
 .Sy groupobjquota
 is similar to
 .Sy groupquota
 but it limits the number of objects a group can consume.
 Please refer to
 .Sy userobjused
 for more information about how objects are counted.
 .It Sy defaultgroupobjquota Ns = Ns Ar size Ns | Ns Sy none
 Sets a default group object quota to be applied to each group for
 whom no groupobj-specific quota is set.
 The value
 .Sy 0
 disables defaultgroupobjquota.
 .It Sy projectquota@ Ns Ar project Ns = Ns Ar size Ns | Ns Sy none
 Limits the amount of space consumed by the specified project.
 Project space consumption is identified by the
 .Sy projectused@ Ns Ar project
 property.
 Please refer to
 .Sy projectused
 for more information about how a project is identified and set/changed.
 .Pp
 The root user, or a user who has been granted the
 .Sy projectquota
 privilege with
 .Nm zfs Cm allow ,
 can access all projects' quota.
 .It Sy defaultprojectquota Ns = Ns Ar size Ns | Ns Sy none
 Sets a default project quota to be applied to each project for whom no
 project-specific quota is set.
 The value
 .Sy 0
 disables defaultprojectquota.
 .It Sy projectobjquota@ Ns Ar project Ns = Ns Ar size Ns | Ns Sy none
 The
 .Sy projectobjquota
 is similar to
 .Sy projectquota
 but it limits the number of objects a project can consume.
 Please refer to
 .Sy userobjused
 for more information about how objects are counted.
 .It Sy defaultprojectobjquota Ns = Ns Ar size Ns | Ns Sy none
 Sets a default project object quota to be applied to each project for
 whom no projectobj-specific quota is set.
 The value
 .Sy 0
 disables defaultprojectobjquota.
 .It Sy readonly Ns = Ns Sy on Ns | Ns Sy off
 Controls whether this dataset can be modified.
 The default value is
 .Sy off .
 The values
 .Sy on
 and
 .Sy off
 are equivalent to the
 .Sy ro
 and
 .Sy rw
 mount options.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy rdonly .
 .It Sy recordsize Ns = Ns Ar size
 Specifies a suggested block size for files in the file system.
 This property is designed solely for use with database workloads that access
 files in fixed-size records.
 ZFS automatically tunes block sizes according to internal algorithms optimized
 for typical access patterns.
 .Pp
 For databases that create very large files but access them in small random
 chunks, these algorithms may be suboptimal.
 Specifying a
 .Sy recordsize
 greater than or equal to the record size of the database can result in
 significant performance gains.
 Use of this property for general purpose file systems is strongly discouraged,
 and may adversely affect performance.
 .Pp
 The size specified must be a power of two greater than or equal to
 .Ar 512 B
 and less than or equal to
 .Ar 128 KiB .
 If the
 .Sy large_blocks
 feature is enabled on the pool, the size may be up to
 .Ar 16 MiB .
 See
 .Xr zpool-features 7
 for details on ZFS feature flags.
 .Pp
 However, blocks larger than
 .Ar 1 MiB
 can have an impact on I/O latency (e.g. tying up a spinning disk for
 ~300ms), and also potentially on the memory allocator.
 .Pp
 Note that the maximum size is still limited by default to
 .Ar 1 MiB
 on x86_32, see
 .Sy zfs_max_recordsize
 module parameter.
 .Pp
 Changing the file system's
 .Sy recordsize
 affects only files created afterward; existing files are unaffected.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy recsize .
 .It Sy redundant_metadata Ns = Ns Sy all Ns | Ns Sy most Ns | Ns Sy some Ns | Ns Sy none
 Controls what types of metadata are stored redundantly.
 ZFS stores an extra copy of metadata, so that if a single block is corrupted,
 the amount of user data lost is limited.
 This extra copy is in addition to any redundancy provided at the pool level
 .Pq e.g. by mirroring or RAID-Z ,
 and is in addition to an extra copy specified by the
 .Sy copies
 property
 .Pq up to a total of 3 copies .
 For example if the pool is mirrored,
 .Sy copies Ns = Ns 2 ,
 and
 .Sy redundant_metadata Ns = Ns Sy most ,
 then ZFS stores 6 copies of most metadata, and 4 copies of data and some
 metadata.
 .Pp
 When set to
 .Sy all ,
 ZFS stores an extra copy of all metadata.
 If a single on-disk block is corrupt, at worst a single block of user data
 .Po which is
 .Sy recordsize
 bytes long
 .Pc
 can be lost.
 .Pp
 When set to
 .Sy most ,
 ZFS stores an extra copy of most types of metadata.
 This can improve performance of random writes, because less metadata must be
 written.
 In practice, at worst about 1000 blocks
 .Po of
 .Sy recordsize
 bytes each
 .Pc
 of user data can be lost if a single on-disk block is corrupt.
 The exact behavior of which metadata blocks are stored redundantly may change in
 future releases.
 .Pp
 When set to
 .Sy some ,
 ZFS stores an extra copy of only critical metadata.
 This can improve file create performance since less metadata
 needs to be written.
-If a single on-disk block is corrupt, at worst a single user file can be lost.
+If a single on-disk block is corrupt, multiple user files or directories
+can be lost.
 .Pp
 When set to
 .Sy none ,
 ZFS does not store any copies of metadata redundantly.
 If a single on-disk block is corrupt, an entire dataset can be lost.
 .Pp
 The default value is
 .Sy all .
 .It Sy refquota Ns = Ns Ar size Ns | Ns Sy none
 Limits the amount of space a dataset can consume.
 This property enforces a hard limit on the amount of space used.
 This hard limit does not include space used by descendants, including file
 systems and snapshots.
 .It Sy refreservation Ns = Ns Ar size Ns | Ns Sy none Ns | Ns Sy auto
 The minimum amount of space guaranteed to a dataset, not including its
 descendants.
 When the amount of space used is below this value, the dataset is treated as if
 it were taking up the amount of space specified by
 .Sy refreservation .
 The
 .Sy refreservation
 reservation is accounted for in the parent datasets' space used, and counts
 against the parent datasets' quotas and reservations.
 .Pp
 If
 .Sy refreservation
 is set, a snapshot is only allowed if there is enough free pool space outside of
 this reservation to accommodate the current number of
 .Qq referenced
 bytes in the dataset.
 .Pp
 If
 .Sy refreservation
 is set to
 .Sy auto ,
 a volume is thick provisioned
 .Po or
 .Qq not sparse
 .Pc .
 .Sy refreservation Ns = Ns Sy auto
 is only supported on volumes.
 See
 .Sy volsize
 in the
 .Sx Native Properties
 section for more information about sparse volumes.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy refreserv .
 .It Sy relatime Ns = Ns Sy on Ns | Ns Sy off
 Controls the manner in which the access time is updated when
 .Sy atime Ns = Ns Sy on
 is set.
 Turning this property on causes the access time to be updated relative
 to the modify or change time.
 Access time is only updated if the previous
 access time was earlier than the current modify or change time or if the
 existing access time hasn't been updated within the past 24 hours.
 The default value is
 .Sy on .
 The values
 .Sy on
 and
 .Sy off
 are equivalent to the
 .Sy relatime
 and
 .Sy norelatime
 mount options.
 .It Sy reservation Ns = Ns Ar size Ns | Ns Sy none
 The minimum amount of space guaranteed to a dataset and its descendants.
 When the amount of space used is below this value, the dataset is treated as if
 it were taking up the amount of space specified by its reservation.
 Reservations are accounted for in the parent datasets' space used, and count
 against the parent datasets' quotas and reservations.
 .Pp
 This property can also be referred to by its shortened column name,
 .Sy reserv .
 .It Sy secondarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata
 Controls what is cached in the secondary cache
 .Pq L2ARC .
 If this property is set to
 .Sy all ,
 then both user data and metadata are cached.
 If this property is set to
 .Sy none ,
 then neither user data nor metadata is cached.
 If this property is set to
 .Sy metadata ,
 then only metadata is cached.
 The default value is
 .Sy all .
 .It Sy prefetch Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata
 Controls what speculative prefetch does.
 If this property is set to
 .Sy all ,
 then both user data and metadata are prefetched.
 If this property is set to
 .Sy none ,
 then neither user data nor metadata are prefetched.
 If this property is set to
 .Sy metadata ,
 then only metadata are prefetched.
 The default value is
 .Sy all .
 .Pp
 Please note that the module parameter zfs_prefetch_disable=1 can
 be used to totally disable speculative prefetch, bypassing anything
 this property does.
 .It Sy setuid Ns = Ns Sy on Ns | Ns Sy off
 Controls whether the setuid bit is respected for the file system.
 The default value is
 .Sy on .
 The values
 .Sy on
 and
 .Sy off
 are equivalent to the
 .Sy suid
 and
 .Sy nosuid
 mount options.
 .It Sy sharesmb Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Ar opts
 Controls whether the file system is shared by using
 .Sy Samba USERSHARES
 and what options are to be used.
 Otherwise, the file system is automatically shared and unshared with the
 .Nm zfs Cm share
 and
 .Nm zfs Cm unshare
 commands.
 If the property is set to on, the
 .Xr net 8
 command is invoked to create a
 .Sy USERSHARE .
 .Pp
 Because SMB shares require a resource name, a unique resource name is
 constructed from the dataset name.
 The constructed name is a copy of the
 dataset name except that the characters in the dataset name, which would be
 invalid in the resource name, are replaced with underscore (_) characters.
 Linux does not currently support additional options which might be available
 on Solaris.
 .Pp
 If the
 .Sy sharesmb
 property is set to
 .Sy off ,
 the file systems are unshared.
 .Pp
 The share is created with the ACL (Access Control List) "Everyone:F" ("F"
 stands for "full permissions", i.e. read and write permissions) and no guest
 access (which means Samba must be able to authenticate a real user \(em
 .Xr passwd 5 Ns / Ns Xr shadow 5 Ns - ,
 LDAP- or
 .Xr smbpasswd 5 Ns -based )
 by default.
 This means that any additional access control
 (e.g. disallowing access for specific users) must be done on the underlying
 file system.
 .Pp
 When the
 .Sy sharesmb
 property is updated with
 .Nm zfs Cm set Fl u ,
 the property is set to the desired value, but the operation to share, reshare
 or unshare the dataset is not performed.
 .It Sy sharenfs Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Ar opts
 Controls whether the file system is shared via NFS, and what options are to be
 used.
 A file system with a
 .Sy sharenfs
 property of
 .Sy off
 is managed with the
 .Xr exportfs 8
 command and entries in the
 .Pa /etc/exports
 file.
 Otherwise, the file system is automatically shared and unshared with the
 .Nm zfs Cm share
 and
 .Nm zfs Cm unshare
 commands.
 If the property is set to
 .Sy on ,
 the dataset is shared using the default options:
 .Dl sec=sys,rw,crossmnt,no_subtree_check
 .Pp
 Please note that the options are comma-separated, unlike those found in
 .Xr exports 5 .
 This is done to negate the need for quoting, as well as to make parsing
 with scripts easier.
 .Pp
 For
 .Fx ,
 there may be multiple sets of options separated by semicolon(s).
 Each set of options must apply to different hosts or networks and each
 set of options will create a separate line for
 .Xr exports 5 .
 Any semicolon separated option set that consists entirely of whitespace
 will be ignored.
 This use of semicolons is only for
 .Fx
 at this time.
 .Pp
 See
 .Xr exports 5
 for the meaning of the default options.
 Otherwise, the
 .Xr exportfs 8
 command is invoked with options equivalent to the contents of this property.
 .Pp
 When the
 .Sy sharenfs
 property is changed for a dataset, the dataset and any children inheriting the
 property are re-shared with the new options, only if the property was previously
 .Sy off ,
 or if they were shared before the property was changed.
 If the new property is
 .Sy off ,
 the file systems are unshared.
 .Pp
 When the
 .Sy sharenfs
 property is updated with
 .Nm zfs Cm set Fl u ,
 the property is set to the desired value, but the operation to share, reshare
 or unshare the dataset is not performed.
 .It Sy logbias Ns = Ns Sy latency Ns | Ns Sy throughput
 Provide a hint to ZFS about handling of synchronous requests in this dataset.
 If
 .Sy logbias
 is set to
 .Sy latency
 .Pq the default ,
 ZFS will use pool log devices
 .Pq if configured
 to handle the requests at low latency.
 If
 .Sy logbias
 is set to
 .Sy throughput ,
 ZFS will not use configured pool log devices.
 ZFS will instead optimize synchronous operations for global pool throughput and
 efficient use of resources.
 .It Sy snapdev Ns = Ns Sy hidden Ns | Ns Sy visible
 Controls whether the volume snapshot devices under
 .Pa /dev/zvol/ Ns Aq Ar pool
 are hidden or visible.
 The default value is
 .Sy hidden .
 .It Sy snapdir Ns = Ns Sy disabled Ns | Ns Sy hidden Ns | Ns Sy visible
 Controls whether the
 .Pa .zfs
 directory is disabled, hidden or visible in the root of the file system as
 discussed in the
 .Sx Snapshots
 section of
 .Xr zfsconcepts 7 .
 The default value is
 .Sy hidden .
 .It Sy sync Ns = Ns Sy standard Ns | Ns Sy always Ns | Ns Sy disabled
 Controls the behavior of synchronous requests
 .Pq e.g. fsync, O_DSYNC .
 .Sy standard
 is the POSIX-specified behavior of ensuring all synchronous requests
 are written to stable storage and all devices are flushed to ensure
 data is not cached by device controllers
 .Pq this is the default .
 .Sy always
 causes every file system transaction to be written and flushed before its
 system call returns.
 This has a large performance penalty.
 .Sy disabled
 disables synchronous requests.
 File system transactions are only committed to stable storage periodically.
 This option will give the highest performance.
 However, it is very dangerous as ZFS would be ignoring the synchronous
 transaction demands of applications such as databases or NFS.
 Administrators should only use this option when the risks are understood.
 .It Sy version Ns = Ns Ar N Ns | Ns Sy current
 The on-disk version of this file system, which is independent of the pool
 version.
 This property can only be set to later supported versions.
 See the
 .Nm zfs Cm upgrade
 command.
 .It Sy volsize Ns = Ns Ar size
 For volumes, specifies the logical size of the volume.
 By default, creating a volume establishes a reservation of equal size.
 For storage pools with a version number of 9 or higher, a
 .Sy refreservation
 is set instead.
 Any changes to
 .Sy volsize
 are reflected in an equivalent change to the reservation
 .Pq or Sy refreservation .
 The
 .Sy volsize
 can only be set to a multiple of
 .Sy volblocksize ,
 and cannot be zero.
 .Pp
 The reservation is kept equal to the volume's logical size to prevent unexpected
 behavior for consumers.
 Without the reservation, the volume could run out of space, resulting in
 undefined behavior or data corruption, depending on how the volume is used.
 These effects can also occur when the volume size is changed while it is in use
 .Pq particularly when shrinking the size .
 Extreme care should be used when adjusting the volume size.
 .Pp
 Though not recommended, a
 .Qq sparse volume
 .Po also known as
 .Qq thin provisioned
 .Pc
 can be created by specifying the
 .Fl s
 option to the
 .Nm zfs Cm create Fl V
 command, or by changing the value of the
 .Sy refreservation
 property
 .Po or
 .Sy reservation
 property on pool version 8 or earlier
 .Pc
 after the volume has been created.
 A
 .Qq sparse volume
 is a volume where the value of
 .Sy refreservation
 is less than the size of the volume plus the space required to store its
 metadata.
 Consequently, writes to a sparse volume can fail with
 .Er ENOSPC
 when the pool is low on space.
 For a sparse volume, changes to
 .Sy volsize
 are not reflected in the
 .Sy refreservation .
 A volume that is not sparse is said to be
 .Qq thick provisioned .
 A sparse volume can become thick provisioned by setting
 .Sy refreservation
 to
 .Sy auto .
 .It Sy volmode Ns = Ns Sy default Ns | Ns Sy full Ns | Ns Sy geom Ns | Ns Sy dev Ns | Ns Sy none
 This property specifies how volumes should be exposed to the OS.
 Setting it to
 .Sy full
 exposes volumes as fully fledged block devices, providing maximal
 functionality.
 The value
 .Sy geom
 is just an alias for
 .Sy full
 and is kept for compatibility.
 Setting it to
 .Sy dev
 hides its partitions.
 Volumes with property set to
 .Sy none
 are not exposed outside ZFS, but can be snapshotted, cloned, replicated, etc.,
 which can be suitable for backup purposes.
 Value
 .Sy default
 means that volume exposure is controlled by the system-wide tunable
 .Sy zvol_volmode ,
 where
 .Sy full ,
 .Sy dev
 and
 .Sy none
 are encoded as 1, 2 and 3 respectively.
 The default value is
 .Sy full .
 .It Sy vscan Ns = Ns Sy on Ns | Ns Sy off
 Controls whether regular files should be scanned for viruses when a file is
 opened and closed.
 In addition to enabling this property, the virus scan service must also be
 enabled for virus scanning to occur.
 The default value is
 .Sy off .
 This property is not used by OpenZFS.
 .It Sy xattr Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy dir Ns | Ns Sy sa
 Controls whether extended attributes are enabled for this file system.
 Two styles of extended attributes are supported: either directory-based
 or system-attribute-based.
 .Pp
 Directory-based extended attributes can be enabled by setting the value to
 .Sy dir .
 This style of extended attribute imposes no practical limit
 on either the size or number of attributes which can be set on a file.
 Although under Linux the
 .Xr getxattr 2
 and
 .Xr setxattr 2
 system calls limit the maximum size to
 .Sy 64K .
 This is the most compatible
 style of extended attribute and is supported by all ZFS implementations.
 .Pp
 System-attribute-based xattrs can be enabled by setting the value to
 .Sy sa
 .Po default and equal to
 .Sy on
 .Pc .
 The key advantage of this type of xattr is improved performance.
 Storing extended attributes as system attributes
 significantly decreases the amount of disk I/O required.
 Up to
 .Sy 64K
 of data may be stored per-file in the space reserved for system attributes.
 If there is not enough space available for an extended attribute
 then it will be automatically written as a directory-based xattr.
 System-attribute-based extended attributes are not accessible
 on platforms which do not support the
 .Sy xattr Ns = Ns Sy sa
 feature.
 OpenZFS supports
 .Sy xattr Ns = Ns Sy sa
 on both
 .Fx
 and Linux.
 .Pp
 The use of system-attribute-based xattrs is strongly encouraged for users of
 SELinux or POSIX ACLs.
 Both of these features heavily rely on extended
 attributes and benefit significantly from the reduced access time.
 .Pp
 The values
 .Sy on
 and
 .Sy off
 are equivalent to the
 .Sy xattr
 and
 .Sy noxattr
 mount options.
 .It Sy jailed Ns = Ns Sy off Ns | Ns Sy on
 Controls whether the dataset is managed from a jail.
 See
 .Xr zfs-jail 8
 for more information.
 Jails are a
 .Fx
 feature and this property is not available on other platforms.
 .It Sy zoned Ns = Ns Sy off Ns | Ns Sy on
 Controls whether the dataset is managed from a non-global zone or namespace.
 See
 .Xr zfs-zone 8
 for more information.
 Zoning is a
 Linux
 feature and this property is not available on other platforms.
 .El
 .Pp
 The following three properties cannot be changed after the file system is
 created, and therefore, should be set when the file system is created.
 If the properties are not set with the
 .Nm zfs Cm create
 or
 .Nm zpool Cm create
 commands, these properties are inherited from the parent dataset.
 If the parent dataset lacks these properties due to having been created prior to
 these features being supported, the new file system will have the default values
 for these properties.
 .Bl -tag -width ""
 .It Xo
 .Sy casesensitivity Ns = Ns Sy sensitive Ns | Ns
 .Sy insensitive Ns | Ns Sy mixed
 .Xc
 Indicates whether the file name matching algorithm used by the file system
 should be case-sensitive, case-insensitive, or allow a combination of both
 styles of matching.
 The default value for the
 .Sy casesensitivity
 property is
 .Sy sensitive .
 Traditionally,
 .Ux
 and POSIX file systems have case-sensitive file names.
 .Pp
 The
 .Sy mixed
 value for the
 .Sy casesensitivity
 property indicates that the file system can support requests for both
 case-sensitive and case-insensitive matching behavior.
 Currently, case-insensitive matching behavior on a file system that supports
 mixed behavior is limited to the SMB server product.
 For more information about the
 .Sy mixed
 value behavior, see the "ZFS Administration Guide".
 .It Xo
 .Sy normalization Ns = Ns Sy none Ns | Ns Sy formC Ns | Ns
 .Sy formD Ns | Ns Sy formKC Ns | Ns Sy formKD
 .Xc
 Indicates whether the file system should perform a
 .Sy Unicode
 normalization of file names whenever two file names are compared, and which
 normalization algorithm should be used.
 File names are always stored unmodified; names are normalized only as part of
 any comparison process.
 If this property is set to a legal value other than
 .Sy none ,
 and the
 .Sy utf8only
 property was left unspecified, the
 .Sy utf8only
 property is automatically set to
 .Sy on .
 The default value of the
 .Sy normalization
 property is
 .Sy none .
 This property cannot be changed after the file system is created.
 .It Sy utf8only Ns = Ns Sy on Ns | Ns Sy off
 Indicates whether the file system should reject file names that include
 characters that are not present in the
 .Sy UTF-8
 character code set.
 If this property is explicitly set to
 .Sy off ,
 the normalization property must either not be explicitly set or be set to
 .Sy none .
 The default value for the
 .Sy utf8only
 property is
 .Sy off .
 This property cannot be changed after the file system is created.
 .El
 .Pp
 The
 .Sy casesensitivity ,
 .Sy normalization ,
 and
 .Sy utf8only
 properties are also new permissions that can be assigned to non-privileged users
 by using the ZFS delegated administration feature.
 .
 .Ss Temporary Mount Point Properties
 When a file system is mounted, either through
 .Xr mount 8
 for legacy mounts or the
 .Nm zfs Cm mount
 command for normal file systems, its mount options are set according to its
 properties.
 The correlation between properties and mount options is as follows:
 .Bl -tag -compact -offset Ds -width "rootcontext="
 .It Sy atime
 atime/noatime
 .It Sy canmount
 auto/noauto
 .It Sy devices
 dev/nodev
 .It Sy exec
 exec/noexec
 .It Sy readonly
 ro/rw
 .It Sy relatime
 relatime/norelatime
 .It Sy setuid
 suid/nosuid
 .It Sy xattr
 xattr/noxattr
 .It Sy nbmand
 mand/nomand
 .It Sy context Ns =
 context=
 .It Sy fscontext Ns =
 fscontext=
 .It Sy defcontext Ns =
 defcontext=
 .It Sy rootcontext Ns =
 rootcontext=
 .El
 .Pp
 In addition, these options can be set on a per-mount basis using the
 .Fl o
 option, without affecting the property that is stored on disk.
 The values specified on the command line override the values stored in the
 dataset.
 The
 .Sy nosuid
 option is an alias for
 .Sy nodevices , Ns Sy nosetuid .
 These properties are reported as
 .Qq temporary
 by the
 .Nm zfs Cm get
 command.
 If the properties are changed while the dataset is mounted, the new setting
 overrides any temporary settings.
 .
 .Ss User Properties
 In addition to the standard native properties, ZFS supports arbitrary user
 properties.
 User properties have no effect on ZFS behavior, but applications or
 administrators can use them to annotate datasets
 .Pq file systems, volumes, and snapshots .
 .Pp
 User property names must contain a colon
 .Pq Qq Sy \&:
 character to distinguish them from native properties.
 They may contain lowercase letters, numbers, and the following punctuation
 characters: colon
 .Pq Qq Sy \&: ,
 dash
 .Pq Qq Sy - ,
 period
 .Pq Qq Sy \&. ,
 and underscore
 .Pq Qq Sy _ .
 The expected convention is that the property name is divided into two portions
 such as
 .Ar module : Ns Ar property ,
 but this namespace is not enforced by ZFS.
 User property names can be at most 256 characters, and cannot begin with a dash
 .Pq Qq Sy - .
 .Pp
 When making programmatic use of user properties, it is strongly suggested to use
 a reversed DNS domain name for the
 .Ar module
 component of property names to reduce the chance that two
 independently-developed packages use the same property name for different
 purposes.
 .Pp
 The values of user properties are arbitrary strings, are always inherited, and
 are never validated.
 All of the commands that operate on properties
 .Po Nm zfs Cm list ,
 .Nm zfs Cm get ,
 .Nm zfs Cm set ,
 and so forth
 .Pc
 can be used to manipulate both native properties and user properties.
 Use the
 .Nm zfs Cm inherit
 command to clear a user property.
 If the property is not defined in any parent dataset, it is removed entirely.
 Property values are limited to 8192 bytes.
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index b3cea3fbf299..131b1d65dde6 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1,2958 +1,2958 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright (c) 2019 Datto Inc.
  * Copyright (c) 2019, 2023, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_prop.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/trace_zfs.h>
 #include <sys/zfs_racct.h>
 #include <sys/zfs_rlock.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
 #endif
 
 /*
  * Enable/disable the nopwrite feature (non-zero enables it).
  */
 static int zfs_nopwrite_enabled = 1;
 
 /*
  * Tunable to control the percentage of dirtied L1 blocks from frees allowed
  * into one TXG.  After this threshold is crossed, additional dirty blocks
  * from frees will wait until the next TXG.
  * A value of zero will disable this throttle.
  */
 static uint_t zfs_per_txg_dirty_frees_percent = 30;
 
 /*
  * Enable/disable forcing a txg sync when dirty checking for holes with
  * lseek().  By default this is enabled to ensure accurate hole reporting;
  * it can result in a significant performance penalty for lseek(SEEK_HOLE)
  * heavy workloads.  Disabling this option will result in holes never being
  * reported in dirty files, which is always safe.
  */
 static int zfs_dmu_offset_next_sync = 1;
 
 /*
  * Limit the amount we can prefetch with one call to this amount.  This
  * helps to limit the amount of memory that can be used by prefetching.
  * Larger objects should be prefetched a bit at a time.
  * The limit is a fixed 8 MiB when _ILP32 is defined and eight maximum-size
  * blocks (8 * SPA_MAXBLOCKSIZE) otherwise.
  */
 #ifdef _ILP32
 uint_t dmu_prefetch_max = 8 * 1024 * 1024;
 #else
 uint_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE;
 #endif
 
 /*
  * Override copies= for dedup state objects.  0 means the traditional
  * behaviour (i.e. the default for the containing objset, e.g. 3 for the
  * MOS).
  */
 uint_t dmu_ddt_copies = 0;
 
 /*
  * Per-object-type descriptor table with DMU_OT_NUMTYPES entries.  Each row
  * pairs a DMU_BSWAP_* byteswap class with three boolean flags and a
  * human-readable type name.
  * NOTE(review): rows are presumably indexed by dmu_object_type_t, and the
  * flag columns correspond to fields of dmu_object_type_info_t declared in
  * dmu.h -- confirm the column meanings there before editing a row.
  */
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "unallocated"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "object directory"	},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "object array"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "packed nvlist"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "packed nvlist size"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj"			},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj header"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map header"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA space map"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, TRUE,  "ZIL intent log"	},
 	{DMU_BSWAP_DNODE,  TRUE,  FALSE, TRUE,  "DMU dnode"		},
 	{DMU_BSWAP_OBJSET, TRUE,  TRUE,  FALSE, "DMU objset"		},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL directory"		},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL directory child map"},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset snap map"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL props"		},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL dataset"		},
 	{DMU_BSWAP_ZNODE,  TRUE,  FALSE, FALSE, "ZFS znode"		},
 	{DMU_BSWAP_OLDACL, TRUE,  FALSE, TRUE,  "ZFS V0 ACL"		},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "ZFS plain file"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS directory"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "ZFS master node"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS delete queue"	},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "zvol object"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "zvol prop"		},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "other uint8[]"		},
 	{DMU_BSWAP_UINT64, FALSE, FALSE, TRUE,  "other uint64[]"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "other ZAP"		},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "persistent error log"	},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, FALSE, "SPA history"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "SPA history offsets"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "Pool properties"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL permissions"	},
 	{DMU_BSWAP_ACL,    TRUE,  FALSE, TRUE,  "ZFS ACL"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "ZFS SYSACL"		},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,  "FUID table"		},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "FUID table size"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dataset next clones"},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan work queue"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project used" },
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,  "ZFS user/group/project quota"},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "snapshot refcount tags"},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT ZAP algorithm"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "DDT statistics"	},
 	{DMU_BSWAP_UINT8,  TRUE,  FALSE, TRUE,	"System attributes"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA master node"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr registration"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, TRUE,	"SA attr layouts"	},
 	{DMU_BSWAP_ZAP,    TRUE,  FALSE, FALSE, "scan translations"	},
 	{DMU_BSWAP_UINT8,  FALSE, FALSE, TRUE,  "deduplicated block"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL deadlist map"	},
 	{DMU_BSWAP_UINT64, TRUE,  TRUE,  FALSE, "DSL deadlist map hdr"	},
 	{DMU_BSWAP_ZAP,    TRUE,  TRUE,  FALSE, "DSL dir clones"	},
 	{DMU_BSWAP_UINT64, TRUE,  FALSE, FALSE, "bpobj subobj"		}
 };
 
 /*
  * Byteswap routines, each paired with a short name.  The array has
  * DMU_BSWAP_NUMFUNCS slots.
  * NOTE(review): the entry order presumably has to match the DMU_BSWAP_*
  * enumeration order -- confirm against dmu.h before reordering.
  */
 dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 	{	byteswap_uint8_array,	"uint8"		},
 	{	byteswap_uint16_array,	"uint16"	},
 	{	byteswap_uint32_array,	"uint32"	},
 	{	byteswap_uint64_array,	"uint64"	},
 	{	zap_byteswap,		"zap"		},
 	{	dnode_buf_byteswap,	"dnode"		},
 	{	dmu_objset_byteswap,	"objset"	},
 	{	zfs_znode_byteswap,	"znode"		},
 	{	zfs_oldacl_byteswap,	"oldacl"	},
 	{	zfs_acl_byteswap,	"acl"		}
 };
 
 /*
  * Hold the dbuf that covers byte `offset` of the given dnode, without
  * issuing a read for its contents.  The block id lookup and the hold are
  * both done with dn_struct_rwlock held as reader.
  *
  * On success *dbp refers to the held buffer and 0 is returned.  If
  * dbuf_hold() yields no buffer, *dbp is set to NULL and EIO is returned.
  */
 int
 dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *held;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	held = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), tag);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (held != NULL) {
 		*dbp = &held->db;
 		return (0);
 	}
 
 	*dbp = NULL;
 	return (SET_ERROR(EIO));
 }
 
 /*
  * Object-number flavour of dmu_buf_hold_noread_by_dnode(): look up the
  * dnode for `object` in `os`, hold the dbuf covering `offset` without
  * reading it, and drop the temporary dnode hold again.
  *
  * Returns the dnode_hold() error if the dnode cannot be held (in which
  * case *dbp is left untouched), EIO with *dbp == NULL if no dbuf could be
  * held, and 0 with *dbp set otherwise.
  */
 int
 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *held;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err != 0)
 		return (err);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	held = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), tag);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	if (held == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	*dbp = &held->db;
 	return (0);
 }
 
 /*
  * Hold the dbuf covering `offset` of `dn` and read its contents.  The read
  * is issued with DB_RF_CANFAIL OR-ed into the caller's flags.
  *
  * Returns 0 with *dbp set on success.  On any failure the hold (if one was
  * taken) is released, *dbp is NULL and the error is returned.
  */
 int
 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db;
 	int err;
 
 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
 	if (err != 0)
 		return (err);
 
 	db = (dmu_buf_impl_t *)(*dbp);
 	err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
 	if (err == 0)
 		return (0);
 
 	/* The read failed: give the hold back and hide the buffer. */
 	dbuf_rele(db, tag);
 	*dbp = NULL;
 	return (err);
 }
 
 /*
  * Hold the dbuf covering `offset` of object `object` in `os` and read its
  * contents.  The read is issued with DB_RF_CANFAIL OR-ed into the caller's
  * flags.
  *
  * Returns 0 with *dbp set on success.  If the read fails, the hold is
  * released, *dbp is set to NULL and the read error is returned.  If the
  * hold itself fails, the error from dmu_buf_hold_noread() is returned.
  */
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     const void *tag, dmu_buf_t **dbp, dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db;
 	int err;
 
 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
 	if (err != 0)
 		return (err);
 
 	db = (dmu_buf_impl_t *)(*dbp);
 	err = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
 	if (err == 0)
 		return (0);
 
 	/* The read failed: give the hold back and hide the buffer. */
 	dbuf_rele(db, tag);
 	*dbp = NULL;
 	return (err);
 }
 
 /*
  * Report the bonus-length limit exposed to DMU consumers, which is the
  * constant DN_OLD_MAX_BONUSLEN.
  */
 int
 dmu_bonus_max(void)
 {
 	const int max_bonuslen = DN_OLD_MAX_BONUSLEN;
 
 	return (max_bonuslen);
 }
 
 /*
  * Change the bonus length of the dnode behind bonus buffer `db_fake` to
  * `newsize` as part of transaction `tx`.
  *
  * Returns EINVAL if `newsize` is negative or larger than the buffer's
  * db_size, or if `db_fake` is not the dnode's bonus buffer; 0 otherwise.
  */
 int
 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error = 0;
 
 	if (newsize < 0 || newsize > db_fake->db_size)
 		return (SET_ERROR(EINVAL));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn->dn_bonus == db)
 		dnode_setbonuslen(dn, newsize, tx);
 	else
 		error = SET_ERROR(EINVAL);
 	DB_DNODE_EXIT(db);
 
 	return (error);
 }
 
 /*
  * Change the bonus type of the dnode behind bonus buffer `db_fake` to
  * `type` as part of transaction `tx`.
  *
  * Returns EINVAL if `type` fails DMU_OT_IS_VALID() or if `db_fake` is not
  * the dnode's bonus buffer; 0 otherwise.
  */
 int
 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error = 0;
 
 	if (!DMU_OT_IS_VALID(type))
 		return (SET_ERROR(EINVAL));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn->dn_bonus == db)
 		dnode_setbonus_type(dn, type, tx);
 	else
 		error = SET_ERROR(EINVAL);
 	DB_DNODE_EXIT(db);
 
 	return (error);
 }
 
 /*
  * Return the dn_bonustype of the dnode that owns buffer `db_fake`.  The
  * field is sampled between DB_DNODE_ENTER() and DB_DNODE_EXIT().
  */
 dmu_object_type_t
 dmu_get_bonustype(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dmu_object_type_t bonustype;
 
 	DB_DNODE_ENTER(db);
 	bonustype = DB_DNODE(db)->dn_bonustype;
 	DB_DNODE_EXIT(db);
 	return (bonustype);
 }
 
 /*
  * Remove the spill block of object `object` in `os` as part of
  * transaction `tx`.
  *
  * The dnode is held for the duration of the call.  dbuf_rm_spill() is
  * invoked first, then dnode_rm_spill() with dn_struct_rwlock held as
  * writer.
  *
  * Returns the dnode_hold() error if the dnode cannot be held, in which
  * case nothing is modified; 0 otherwise.
  */
 int
 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error != 0) {
 		/* dn was never set up; it must not be used or released. */
 		return (error);
 	}
 
 	dbuf_rm_spill(dn, tx);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_rm_spill(dn, tx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 /*
  * Lookup and hold the bonus buffer for the provided dnode.  If the dnode
  * does not yet have a bonus dbuf, a new one will be allocated.
  * Returns ENOENT, EIO, or 0.
  *
  * On success *dbp refers to the held bonus buffer.  On a read failure the
  * hold is dropped again, *dbp is set to NULL and the dbuf_read() error is
  * returned.  The read is issued with DB_RF_CANFAIL OR-ed into `flags`.
  */
 int dmu_bonus_hold_by_dnode(dnode_t *dn, const void *tag, dmu_buf_t **dbp,
     dmu_flags_t flags)
 {
 	dmu_buf_impl_t *db;
 	int error;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
 		/*
 		 * Creating the bonus dbuf is done under the writer lock.  If
 		 * the in-place upgrade fails, the lock is dropped and retaken
 		 * as writer, so dn_bonus has to be checked again below.
 		 */
 		if (!rw_tryupgrade(&dn->dn_struct_rwlock)) {
 			rw_exit(&dn->dn_struct_rwlock);
 			rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		}
 		if (dn->dn_bonus == NULL)
 			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
 
 	/* as long as the bonus buf is held, the dnode will be held */
 	if (zfs_refcount_add(&db->db_holds, tag) == 1) {
 		/* First hold on the bonus dbuf: account for it on the dnode. */
 		VERIFY(dnode_add_ref(dn, db));
 		atomic_inc_32(&dn->dn_dbufs_count);
 	}
 
 	/*
 	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
 	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
 	 * a dnode hold for every dbuf.
 	 */
 	rw_exit(&dn->dn_struct_rwlock);
 
 	error = dbuf_read(db, NULL, flags | DB_RF_CANFAIL);
 	if (error) {
 		/* Read failed: evict the bonus dbuf and drop our hold on it. */
 		dnode_evict_bonus(dn);
 		dbuf_rele(db, tag);
 		*dbp = NULL;
 		return (error);
 	}
 
 	*dbp = &db->db;
 	return (0);
 }
 
 /*
  * Hold the bonus buffer of <os, object>.  The dnode is held only for the
  * duration of the call; prefetch is disabled for the bonus read.
  */
 int
 dmu_bonus_hold(objset_t *os, uint64_t object, const void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err == 0) {
 		err = dmu_bonus_hold_by_dnode(dn, tag, dbp,
 		    DMU_READ_NO_PREFETCH);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (err);
 }
 
 /*
  * Hold and read the spill dbuf of the given dnode.
  * Returns ENOENT, EIO, or 0.
  *
  * This interface will allocate a blank spill dbuf when a spill blk
  * doesn't already exist on the dnode.  Callers that only want to find
  * an already existing spill db should use dmu_spill_hold_existing().
  *
  * dn_struct_rwlock is taken as reader around the hold unless the caller
  * passes DB_RF_HAVESTRUCT.  On failure *dbp is set to NULL.
  */
 int
 dmu_spill_hold_by_dnode(dnode_t *dn, dmu_flags_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	const boolean_t take_lock =
 	    (flags & DB_RF_HAVESTRUCT) ? B_FALSE : B_TRUE;
 	dmu_buf_impl_t *spill;
 	int err;
 
 	if (take_lock)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	spill = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 	if (take_lock)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	if (spill == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	err = dbuf_read(spill, NULL, flags);
 	if (err != 0) {
 		dbuf_rele(spill, tag);
 		*dbp = NULL;
 		return (err);
 	}
 
 	*dbp = &spill->db;
 	return (err);
 }
 
 /*
  * Hold the spill dbuf of the dnode that owns 'bonus', but only if the
  * dnode already has a spill block.
  *
  * Returns EINVAL when the pool version predates SPA_VERSION_SA, ENOENT
  * when the dnode has no spill block, otherwise the result of
  * dmu_spill_hold_by_dnode().
  */
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, const void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dnode_t *dn = DB_DNODE(db);
 
 	if (spa_version(dn->dn_objset->os_spa) >= SPA_VERSION_SA) {
 		/* dn_have_spill is examined under dn_struct_rwlock */
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		if (dn->dn_have_spill) {
 			err = dmu_spill_hold_by_dnode(dn,
 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 		} else {
 			err = SET_ERROR(ENOENT);
 		}
 		rw_exit(&dn->dn_struct_rwlock);
 	} else {
 		err = SET_ERROR(EINVAL);
 	}
 
 	DB_DNODE_EXIT(db);
 	return (err);
 }
 
 /*
  * Hold the spill dbuf of the dnode that owns 'bonus'; a thin wrapper
  * around dmu_spill_hold_by_dnode().
  */
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, dmu_flags_t flags, const void *tag,
     dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *bonus_impl = (dmu_buf_impl_t *)bonus;
 	int rc;
 
 	DB_DNODE_ENTER(bonus_impl);
 	rc = dmu_spill_hold_by_dnode(DB_DNODE(bonus_impl), flags, tag, dbp);
 	DB_DNODE_EXIT(bonus_impl);
 
 	return (rc);
 }
 
 /*
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
  * whose dnodes are in the same block.
  *
  * Hold every level-0 dbuf covering [offset, offset + length) of the dnode
  * and, when 'read' is set, read them in and wait for the data.
  *
  * On success, *dbpp is a newly allocated array of *numbufsp held buffers;
  * dmu_buf_rele_array() drops the holds and frees the array.  On failure
  * any holds taken are released, the array is freed, and the outputs are
  * left untouched.
  *
  * Returns EIO if the access runs past the end of an object with a
  * non-power-of-2 block size (dn_datablkshift == 0), if a dbuf cannot be
  * held, or if a buffer ends up DB_UNCACHED after the read; otherwise the
  * error from the root zio, or 0.
  */
 int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, const void *tag, int *numbufsp, dmu_buf_t ***dbpp,
     dmu_flags_t flags)
 {
 	dmu_buf_t **dbp;
 	zstream_t *zs = NULL;
 	uint64_t blkid, nblks, i;
 	dmu_flags_t dbuf_flags;
 	int err;
 	zio_t *zio = NULL;
 	boolean_t missed = B_FALSE;
 
 	ASSERT(!read || length <= DMU_MAX_ACCESS);
 
 	/*
 	 * Note: We directly notify the prefetch code of this read, so that
 	 * we can tell it about the multi-block read.  dbuf_read() only knows
 	 * about the one block it is accessing.
 	 */
 	dbuf_flags = (flags & ~DMU_READ_PREFETCH) | DMU_READ_NO_PREFETCH |
 	    DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		/* number of blocks spanned by the block-aligned range */
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
 		    P2ALIGN_TYPED(offset, 1ULL << blkshift, uint64_t))
 		    >> blkshift;
 	} else {
 		/* no block shift: only a single data block can be accessed */
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
 			    "%llx/%llx (size=%u access=%llu+%llu)",
 			    (longlong_t)dn->dn_objset->
 			    os_dsl_dataset->ds_object,
 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
 			    (longlong_t)offset, (longlong_t)length);
 			rw_exit(&dn->dn_struct_rwlock);
 			return (SET_ERROR(EIO));
 		}
 		nblks = 1;
 	}
 	/* zeroed so that a partially filled array can be released safely */
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	/* all demand reads below are issued as children of this root zio */
 	if (read)
 		zio = zio_root(dn->dn_objset->os_spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	if ((flags & DMU_READ_NO_PREFETCH) == 0) {
 		/*
 		 * Prepare the zfetch before initiating the demand reads, so
 		 * that if multiple threads block on same indirect block, we
 		 * base predictions on the original less racy request order.
 		 */
 		zs = dmu_zfetch_prepare(&dn->dn_zfetch, blkid, nblks,
 		    read && !(flags & DMU_DIRECTIO), B_TRUE);
 	}
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			/*
 			 * Hold failed: finish the prepared prefetch stream,
 			 * drop the lock, release what was held so far and
 			 * dispatch the root zio without waiting for it.
 			 */
 			if (zs) {
 				dmu_zfetch_run(&dn->dn_zfetch, zs, missed,
 				    B_TRUE, (flags & DMU_UNCACHEDIO));
 			}
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			if (read)
 				zio_nowait(zio);
 			return (SET_ERROR(EIO));
 		}
 
 		/*
 		 * Initiate async demand data read.
 		 * We check the db_state after calling dbuf_read() because
 		 * (1) dbuf_read() may change the state to CACHED due to a
 		 * hit in the ARC, and (2) on a cache miss, a child will
 		 * have been added to "zio" but not yet completed, so the
 		 * state will not yet be CACHED.
 		 */
 		if (read) {
 			/*
 			 * The last block is flagged as a partial access when
 			 * it is not the object's last block and the request
 			 * ends before the end of that block.
 			 */
 			if (i == nblks - 1 && blkid + i < dn->dn_maxblkid &&
 			    offset + length < db->db.db_offset +
 			    db->db.db_size) {
 				if (offset <= db->db.db_offset)
 					dbuf_flags |= DMU_PARTIAL_FIRST;
 				else
 					dbuf_flags |= DMU_PARTIAL_MORE;
 			}
 			(void) dbuf_read(db, zio, dbuf_flags);
 			if (db->db_state != DB_CACHED)
 				missed = B_TRUE;
 		}
 		dbp[i] = &db->db;
 	}
 
 	/*
 	 * If we are doing O_DIRECT we still hold the dbufs, even for reads,
 	 * but we do not issue any reads here. We do not want to account for
 	 * writes in this case.
 	 *
 	 * O_DIRECT write/read accounting takes place in
 	 * dmu_{write/read}_abd().
 	 */
 	if (!read && ((flags & DMU_DIRECTIO) == 0))
 		zfs_racct_write(dn->dn_objset->os_spa, length, nblks, flags);
 
 	if (zs) {
 		dmu_zfetch_run(&dn->dn_zfetch, zs, missed, B_TRUE,
 		    (flags & DMU_UNCACHEDIO));
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (read) {
 		/* wait for async read i/o */
 		err = zio_wait(zio);
 		if (err) {
 			dmu_buf_rele_array(dbp, nblks, tag);
 			return (err);
 		}
 
 		/* wait for other io to complete */
 		for (i = 0; i < nblks; i++) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL)
 				cv_wait(&db->db_changed, &db->db_mtx);
 			/* still uncached after the wait: report as EIO */
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 			if (err) {
 				dmu_buf_rele_array(dbp, nblks, tag);
 				return (err);
 			}
 		}
 	}
 
 	*numbufsp = nblks;
 	*dbpp = dbp;
 	return (0);
 }
 
 /*
  * Hold the dbufs covering [offset, offset + length) of <os, object>.
  * The dnode is looked up and held only for the duration of the call;
  * see dmu_buf_hold_array_by_dnode() for the actual work.
  */
 int
 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err == 0) {
 		err = dmu_buf_hold_array_by_dnode(dn, offset, length, read,
 		    tag, numbufsp, dbpp, DMU_READ_PREFETCH);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (err);
 }
 
 /*
  * Hold the dbufs covering [offset, offset + length) of the object that
  * owns db_fake, without a separate dnode lookup.
  */
 int
 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, boolean_t read, const void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *owner = (dmu_buf_impl_t *)db_fake;
 	int rc;
 
 	DB_DNODE_ENTER(owner);
 	rc = dmu_buf_hold_array_by_dnode(DB_DNODE(owner), offset, length,
 	    read, tag, numbufsp, dbpp, DMU_READ_PREFETCH);
 	DB_DNODE_EXIT(owner);
 
 	return (rc);
 }
 
 /*
  * Release every non-NULL buffer in the array with the given tag and then
  * free the array itself.  Nothing is done when numbufs is 0.
  */
 void
 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, const void *tag)
 {
 	dmu_buf_impl_t **bufs = (dmu_buf_impl_t **)dbp_fake;
 
 	if (numbufs == 0)
 		return;
 
 	for (int idx = 0; idx < numbufs; idx++) {
 		dmu_buf_impl_t *held = bufs[idx];
 
 		/* slots may be NULL when the array was only partly filled */
 		if (held != NULL)
 			dbuf_rele(held, tag);
 	}
 
 	kmem_free(bufs, sizeof (dmu_buf_t *) * numbufs);
 }
 
 /*
  * Issue prefetch I/Os for the given blocks.  If level is greater than 0, the
  * indirect blocks prefetched will be those that point to the blocks containing
  * the data starting at offset, and continuing to offset + len.  If the range
  * is too long, prefetch the first dmu_prefetch_max bytes as requested, while
  * for the rest only a higher level, also fitting within dmu_prefetch_max.  It
  * should primarily help random reads, since for long sequential reads there is
  * a speculative prefetcher.
  *
  * When dmu_prefetch_max is 0 or len is 0, only the object's dnode is
  * prefetched.
  *
  * Note that if the indirect blocks above the blocks being prefetched are not
  * in cache, they will be asynchronously read in.  Dnode read by dnode_hold()
  * is currently synchronous.
  */
 void
 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 
 	if (dmu_prefetch_max == 0 || len == 0) {
 		dmu_prefetch_dnode(os, object, pri);
 		return;
 	}
 
 	/* a failed hold is silently ignored */
 	if (dnode_hold(os, object, FTAG, &dn) == 0) {
 		dmu_prefetch_by_dnode(dn, level, offset, len, pri);
 		dnode_rele(dn, FTAG);
 	}
 }
 
 /*
  * Issue prefetches for the byte range [offset, offset + len) of an
  * already held dnode at the given level.  dn_struct_rwlock is held as
  * reader for the whole operation.  The total amount prefetched is bounded
  * using dmu_prefetch_max, as described in the comments below.
  */
 void
 dmu_prefetch_by_dnode(dnode_t *dn, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	int64_t level2 = level;
 	uint64_t start, end, start2, end2;
 
 	/*
 	 * Depending on len we may do two prefetches: blocks [start, end) at
 	 * level, and following blocks [start2, end2) at higher level2.
 	 */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift != 0) {
 		/*
 		 * The object has multiple blocks.  Calculate the full range
 		 * of blocks [start, end2) and then split it into two parts,
 		 * so that the first [start, end) fits into dmu_prefetch_max.
 		 */
 		start = dbuf_whichblock(dn, level, offset);
 		end2 = dbuf_whichblock(dn, level, offset + len - 1) + 1;
 		uint8_t ibs = dn->dn_indblkshift;
 		/* block shift at 'level': data blocks at 0, indirect above */
 		uint8_t bs = (level == 0) ? dn->dn_datablkshift : ibs;
 		/* dmu_prefetch_max expressed in blocks of 1 << bs, rounded up */
 		uint_t limit = P2ROUNDUP(dmu_prefetch_max, 1 << bs) >> bs;
 		start2 = end = MIN(end2, start + limit);
 
 		/*
 		 * Find level2 where [start2, end2) fits into dmu_prefetch_max.
 		 */
 		/* presumably log2 of block pointers per indirect block */
 		uint8_t ibps = ibs - SPA_BLKPTRSHIFT;
 		limit = P2ROUNDUP(dmu_prefetch_max, 1 << ibs) >> ibs;
 		/* always runs at least once, so level2 > level */
 		do {
 			level2++;
 			start2 = P2ROUNDUP(start2, 1 << ibps) >> ibps;
 			end2 = P2ROUNDUP(end2, 1 << ibps) >> ibps;
 		} while (end2 - start2 > limit);
 	} else {
 		/* There is only one block.  Prefetch it or nothing. */
 		start = start2 = end2 = 0;
 		end = start + (level == 0 && offset < dn->dn_datablksz);
 	}
 
 	for (uint64_t i = start; i < end; i++)
 		dbuf_prefetch(dn, level, i, pri, 0);
 	for (uint64_t i = start2; i < end2; i++)
 		dbuf_prefetch(dn, level2, i, pri, 0);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * State shared between dmu_prefetch_wait_by_dnode() and the
  * dmu_prefetch_done() completion callback.
  */
 typedef struct {
 	kmutex_t	dpa_lock;	/* protects dpa_pending_io */
 	kcondvar_t	dpa_cv;		/* broadcast when pending hits 0 */
 	uint64_t	dpa_pending_io;	/* prefetches not yet completed */
 } dmu_prefetch_arg_t;
 
 /*
  * Prefetch completion callback: account for one finished prefetch and
  * wake up the waiter once none remain pending.
  */
 static void
 dmu_prefetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t issued)
 {
 	(void) level; (void) blkid; (void)issued;
 	dmu_prefetch_arg_t *dpa = arg;
 
 	ASSERT0(level);
 
 	mutex_enter(&dpa->dpa_lock);
 	ASSERT3U(dpa->dpa_pending_io, >, 0);
 	dpa->dpa_pending_io--;
 	if (dpa->dpa_pending_io == 0)
 		cv_broadcast(&dpa->dpa_cv);
 	mutex_exit(&dpa->dpa_lock);
 }
 
 /*
  * Issue a level-0 prefetch for every block covering [offset, offset + len)
  * of the dnode and block until dmu_prefetch_done() has been called for
  * each of them.
  */
 static void
 dmu_prefetch_wait_by_dnode(dnode_t *dn, uint64_t offset, uint64_t len)
 {
 	dmu_prefetch_arg_t dpa;
 	uint64_t first_blk, last_blk_excl, blk;
 
 	mutex_init(&dpa.dpa_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dpa.dpa_cv, NULL, CV_DEFAULT, NULL);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	first_blk = dbuf_whichblock(dn, 0, offset);
 	last_blk_excl = dbuf_whichblock(dn, 0, offset + len - 1) + 1;
 
 	/* one completion callback is expected per block */
 	dpa.dpa_pending_io = last_blk_excl - first_blk;
 
 	blk = first_blk;
 	while (blk < last_blk_excl) {
 		(void) dbuf_prefetch_impl(dn, 0, blk, ZIO_PRIORITY_ASYNC_READ,
 		    0, dmu_prefetch_done, &dpa);
 		blk++;
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* wait for prefetch L0 reads to finish */
 	mutex_enter(&dpa.dpa_lock);
 	while (dpa.dpa_pending_io > 0)
 		cv_wait(&dpa.dpa_cv, &dpa.dpa_lock);
 	mutex_exit(&dpa.dpa_lock);
 
 	mutex_destroy(&dpa.dpa_lock);
 	cv_destroy(&dpa.dpa_cv);
 }
 
 /*
  * Issue prefetch I/Os for the given L0 block range and wait for the I/O
  * to complete. This does not enforce dmu_prefetch_max and will prefetch
  * the entire range. The blocks are read from disk into the ARC but no
  * decompression occurs (i.e., the dbuf cache is not required).
  *
  * Returns the error from dnode_hold(), EINTR when a signal is pending
  * after a chunk has been processed, or 0.
  */
 int
 dmu_prefetch_wait(objset_t *os, uint64_t object, uint64_t offset, uint64_t size)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err != 0)
 		return (err);
 
 	/*
 	 * Chunk the requests (16 indirects worth) so that we can be interrupted
 	 */
 	uint64_t chunksize = dn->dn_datablksz;
 	if (dn->dn_indblkshift) {
 		uint64_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1);
 		chunksize = (nbps * 16) << dn->dn_datablkshift;
 	}
 
 	for (uint64_t remaining = size; remaining > 0; ) {
 		uint64_t step = MIN(remaining, chunksize);
 
 		dmu_prefetch_wait_by_dnode(dn, offset, step);
 
 		offset += step;
 		remaining -= step;
 
 		/* signals are only checked between chunks */
 		if (issig()) {
 			err = SET_ERROR(EINTR);
 			break;
 		}
 	}
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 /*
  * Issue prefetch I/Os for the given object's dnode, i.e. for the meta
  * dnode block that contains it.  Object 0 and object numbers at or above
  * DN_MAX_OBJECT are ignored.
  */
 void
 dmu_prefetch_dnode(objset_t *os, uint64_t object, zio_priority_t pri)
 {
 	if (object != 0 && object < DN_MAX_OBJECT) {
 		dnode_t *mdn = DMU_META_DNODE(os);
 
 		rw_enter(&mdn->dn_struct_rwlock, RW_READER);
 		uint64_t dnode_blkid = dbuf_whichblock(mdn, 0,
 		    object * sizeof (dnode_phys_t));
 		dbuf_prefetch(mdn, 0, dnode_blkid, pri, 0);
 		rw_exit(&mdn->dn_struct_rwlock);
 	}
 }
 
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crash in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
  *
  * On input, *start should be the first offset that does not need to be
  * freed (e.g. "offset + length").  On return, *start will be the first
  * offset that should be freed and l1blks is set to the number of level 1
  * indirect blocks found within the chunk.
  *
  * 'minimum' is the lowest offset the chunk may begin at; *start is never
  * returned below it.  Returns 0, or the error from dnode_next_offset()
  * (other than ESRCH), in which case *l1blks holds the count found so far.
  */
 static int
 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
 {
 	uint64_t blks;
 	/* cap on the number of L1 blocks covered by a single chunk */
 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange = (uint64_t)dn->dn_datablksz *
 	    EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
 	ASSERT3U(minimum, <=, *start);
 
 	/* dn_nlevels == 1 means we don't have any L1 blocks */
 	if (dn->dn_nlevels <= 1) {
 		*l1blks = 0;
 		*start = minimum;
 		return (0);
 	}
 
 	/*
 	 * Check if we can free the entire range assuming that all of the
 	 * L1 blocks in this range have data. If we can, we use this
 	 * worst case value as an estimate so we can avoid having to look
 	 * at the object's actual data.
 	 */
 	uint64_t total_l1blks =
 	    (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
 	    iblkrange;
 	if (total_l1blks <= maxblks) {
 		*l1blks = total_l1blks;
 		*start = minimum;
 		return (0);
 	}
 	/* the P2ALIGN below requires a power-of-2 range */
 	ASSERT(ISP2(iblkrange));
 
 	/* walk backwards one allocated L1 at a time, up to maxblks of them */
 	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
 		/*
 		 * dnode_next_offset(BACKWARDS) will find an allocated L1
 		 * indirect block at or before the input offset.  We must
 		 * decrement *start so that it is at the end of the region
 		 * to search.
 		 */
 		(*start)--;
 
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
 		/* if there are no indirect blocks before start, we are done */
 		if (err == ESRCH) {
 			*start = minimum;
 			break;
 		} else if (err != 0) {
 			*l1blks = blks;
 			return (err);
 		}
 
 		/* set start to the beginning of this L1 indirect */
 		*start = P2ALIGN_TYPED(*start, iblkrange, uint64_t);
 	}
 	/* aligning down may have moved *start below the requested minimum */
 	if (*start < minimum)
 		*start = minimum;
 	*l1blks = blks;
 
 	return (0);
 }
 
 /*
  * Report whether the filesystem backing this objset is being unmounted.
  * In kernel builds, for objsets of type DMU_OST_ZFS, this returns the vfs
  * unmounted flag; in every other case it returns B_FALSE.
  * Used below in dmu_free_long_range_impl() to enable abort when unmounting
  */
 static boolean_t
 dmu_objset_zfs_unmounting(objset_t *os)
 {
 	boolean_t unmounting = B_FALSE;
 
 #ifdef _KERNEL
 	if (dmu_objset_type(os) == DMU_OST_ZFS)
 		unmounting = zfs_get_vfs_flag_unmounted(os);
 #else
 	(void) os;
 #endif
 	return (unmounting);
 }
 
 /*
  * Free the range [offset, offset + length) of the dnode in chunks, working
  * backwards from the end of the range, with each chunk freed in its own
  * transaction.  A length of DMU_OBJECT_END, or a range extending past the
  * object size, is clamped to the end of the object.
  *
  * Returns EINVAL if dn is NULL, EINTR if the filesystem is being
  * unmounted, the error from get_next_chunk() or dmu_tx_assign(), or 0.
  * Chunks already committed are not undone when an error is returned.
  */
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     uint64_t length)
 {
 	uint64_t object_size;
 	int err;
 	uint64_t dirty_frees_threshold;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	if (dn == NULL)
 		return (SET_ERROR(EINVAL));
 
 	object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	/* nothing to free at or beyond the end of the object */
 	if (offset >= object_size)
 		return (0);
 
 	/* a percentage above 100 falls back to 1/20th of the dirty max */
 	if (zfs_per_txg_dirty_frees_percent <= 100)
 		dirty_frees_threshold =
 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 	else
 		dirty_frees_threshold = zfs_dirty_data_max / 20;
 
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;
 
 	while (length != 0) {
 		uint64_t chunk_end, chunk_begin, chunk_len;
 		uint64_t l1blks;
 		dmu_tx_t *tx;
 
 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
 			return (SET_ERROR(EINTR));
 
 		chunk_end = chunk_begin = offset + length;
 
 		/* move chunk_begin backwards to the beginning of this chunk */
 		err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
 		if (err)
 			return (err);
 		ASSERT3U(chunk_begin, >=, offset);
 		ASSERT3U(chunk_begin, <=, chunk_end);
 
 		chunk_len = chunk_end - chunk_begin;
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
 		/*
 		 * Mark this transaction as typically resulting in a net
 		 * reduction in space used.
 		 */
 		dmu_tx_mark_netfree(tx);
 		err = dmu_tx_assign(tx, DMU_TX_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
 
 		uint64_t txg = dmu_tx_get_txg(tx);
 
 		/* snapshot this txg's running total of long-free dirty bytes */
 		mutex_enter(&dp->dp_lock);
 		uint64_t long_free_dirty =
 		    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
 		mutex_exit(&dp->dp_lock);
 
 		/*
 		 * To avoid filling up a TXG with just frees, wait for
 		 * the next TXG to open before freeing more chunks if
 		 * we have reached the threshold of frees.
 		 *
 		 * length is not reduced on this path, so the same chunk is
 		 * recomputed and retried on the next iteration.
 		 */
 		if (dirty_frees_threshold != 0 &&
 		    long_free_dirty >= dirty_frees_threshold) {
 			DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay);
 			dmu_tx_commit(tx);
 			txg_wait_open(dp, 0, B_TRUE);
 			continue;
 		}
 
 		/*
 		 * In order to prevent unnecessary write throttling, for each
 		 * TXG, we track the cumulative size of L1 blocks being dirtied
 		 * in dnode_free_range() below. We compare this number to a
 		 * tunable threshold, past which we prevent new L1 dirty freeing
 		 * blocks from being added into the open TXG. See
 		 * dmu_free_long_range_impl() for details. The threshold
 		 * prevents write throttle activation due to dirty freeing L1
 		 * blocks taking up a large percentage of zfs_dirty_data_max.
 		 */
 		mutex_enter(&dp->dp_lock);
 		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
 		    l1blks << dn->dn_indblkshift;
 		mutex_exit(&dp->dp_lock);
 		DTRACE_PROBE3(free__long__range,
 		    uint64_t, long_free_dirty, uint64_t, chunk_len,
 		    uint64_t, txg);
 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 
 		dmu_tx_commit(tx);
 
 		/* the freed chunk is the tail of the range; shrink the range */
 		length -= chunk_len;
 	}
 	return (0);
 }
 
 /*
  * Free [offset, offset + length) of <os, object> using as many
  * transactions as dmu_free_long_range_impl() needs.
  */
 int
 dmu_free_long_range(objset_t *os, uint64_t object,
     uint64_t offset, uint64_t length)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err != 0)
 		return (err);
 
 	err = dmu_free_long_range_impl(os, dn, offset, length);
 
 	/*
 	 * It is important to zero out the maxblkid when freeing the entire
 	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
 	 * will take the fast path, and (b) dnode_reallocate() can verify
 	 * that the entire file has been freed.
 	 */
 	boolean_t whole_object = (offset == 0 && length == DMU_OBJECT_END);
 	if (whole_object && err == 0)
 		dn->dn_maxblkid = 0;
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * Free all of an object's data and then the object itself.  The data is
  * freed first via dmu_free_long_range(); the object is then freed in a
  * separate transaction.
  */
 int
 dmu_free_long_object(objset_t *os, uint64_t object)
 {
 	int err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 
 	if (err != 0)
 		return (err);
 
 	dmu_tx_t *tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, object);
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 	dmu_tx_mark_netfree(tx);
 
 	err = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (err != 0) {
 		/* an unassigned transaction is aborted, not committed */
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	err = dmu_object_free(os, object, tx);
 	dmu_tx_commit(tx);
 
 	return (err);
 }
 
 /*
  * Free [offset, offset + size) of <os, object> within the caller's
  * transaction.  Returns the error from dnode_hold(), or 0.
  */
 int
 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	/* the range must not wrap around the 64-bit offset space */
 	ASSERT(offset < UINT64_MAX);
 	ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
 
 	dnode_free_range(dn, offset, size, tx);
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Read 'size' bytes starting at 'offset' of the dnode into 'buf'.
  *
  * For an object whose dn_maxblkid is 0, the part of the request beyond
  * the first block is zero-filled.  A page-aligned DMU_DIRECTIO request
  * goes through dmu_read_abd(); everything else is copied out of held
  * dbufs in pieces of at most DMU_MAX_ACCESS / 2 bytes.
  */
 static int
 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
     void *buf, dmu_flags_t flags)
 {
 	int err = 0;
 
 	/*
 	 * Deal with odd block sizes, where there can't be data past the first
 	 * block. If we ever do the tail block optimization, we will need to
 	 * handle that here as well.
 	 */
 	if (dn->dn_maxblkid == 0) {
 		uint64_t newsz = 0;
 
 		if (!(offset > dn->dn_datablksz))
 			newsz = MIN(size, dn->dn_datablksz - offset);
 		memset((char *)buf + newsz, 0, size - newsz);
 		size = newsz;
 	}
 
 	if (size == 0)
 		return (0);
 
 	/* Allow Direct I/O when requested and properly aligned */
 	if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned(buf) &&
 	    zfs_dio_aligned(offset, size, PAGESIZE)) {
 		abd_t *data = abd_get_from_buf(buf, size);
 		err = dmu_read_abd(dn, offset, size, data, flags);
 		abd_free(data);
 		return (err);
 	}
 	flags &= ~DMU_DIRECTIO;
 
 	/* write cursor into the caller's buffer */
 	char *dst = buf;
 
 	while (size > 0) {
 		dmu_buf_t **dbp;
 		int numbufs;
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 
 		/*
 		 * NB: we could do this block-at-a-time, but it's nice
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (err != 0)
 			return (err);
 
 		for (int i = 0; i < numbufs; i++) {
 			dmu_buf_t *db = dbp[i];
 			uint64_t tocpy;
 			int64_t bufoff;
 
 			ASSERT(size > 0);
 
 			/* position of 'offset' within this buffer */
 			bufoff = offset - db->db_offset;
 			tocpy = MIN(db->db_size - bufoff, size);
 
 			ASSERT(db->db_data != NULL);
 			(void) memcpy(dst, (char *)db->db_data + bufoff, tocpy);
 
 			dst += tocpy;
 			offset += tocpy;
 			size -= tocpy;
 		}
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 
 	return (0);
 }
 
 /*
  * Read 'size' bytes at 'offset' of <os, object> into 'buf'.  The dnode
  * is held only for the duration of the call.
  */
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     void *buf, dmu_flags_t flags)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err == 0) {
 		err = dmu_read_impl(dn, offset, size, buf, flags);
 		dnode_rele(dn, FTAG);
 	}
 	return (err);
 }
 
 /*
  * Same as dmu_read(), for callers that already hold the dnode.
  */
 int
 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     dmu_flags_t flags)
 {
 	int err = dmu_read_impl(dn, offset, size, buf, flags);
 
 	return (err);
 }
 
 /*
  * Copy 'size' bytes from 'buf' into the held buffers dbp[0..numbufs),
  * starting at object offset 'offset'.  A buffer that is overwritten in
  * full is filled; a partially written buffer is dirtied instead.
  */
 static void
 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
 {
 	/* read cursor into the caller's buffer */
 	const char *src = buf;
 
 	for (int i = 0; i < numbufs; i++) {
 		dmu_buf_t *db = dbp[i];
 		int64_t bufoff;
 		uint64_t tocpy;
 
 		ASSERT(size > 0);
 
 		bufoff = offset - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 		/* only the first and last buffers may be partially written */
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy != db->db_size) {
 			/* last buffer, and the write stops short of its end */
 			if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
 				flags |= (bufoff == 0) ?
 				    DMU_PARTIAL_FIRST : DMU_PARTIAL_MORE;
 			}
 			dmu_buf_will_dirty_flags(db, tx, flags);
 		} else {
 			dmu_buf_will_fill_flags(db, tx, B_FALSE, flags);
 		}
 
 		ASSERT(db->db_data != NULL);
 		(void) memcpy((char *)db->db_data + bufoff, src, tocpy);
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx, B_FALSE);
 
 		src += tocpy;
 		offset += tocpy;
 		size -= tocpy;
 	}
 }
 
 /*
  * Write 'size' bytes from 'buf' at 'offset' of <os, object> in the given
  * transaction.  A zero-length write is a no-op; a failure to hold the
  * buffers trips VERIFY0.
  */
 void
 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	if (size != 0) {
 		dmu_buf_t **dbp;
 		int numbufs;
 
 		VERIFY0(dmu_buf_hold_array(os, object, offset, size,
 		    FALSE, FTAG, &numbufs, &dbp));
 		dmu_write_impl(dbp, numbufs, offset, size, buf, tx,
 		    DMU_READ_PREFETCH);
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 }
 
 /*
  * Write 'size' bytes from 'buf' at 'offset' of an already held dnode.
  *
  * A DMU_DIRECTIO request with a page-aligned buffer and a block-aligned
  * offset/size goes through dmu_write_abd() and returns its result;
  * anything else is written through held dbufs and returns 0.
  */
 int
 dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx, dmu_flags_t flags)
 {
 	if (size == 0)
 		return (0);
 
 	/* Allow Direct I/O when requested and properly aligned */
 	if ((flags & DMU_DIRECTIO) && zfs_dio_page_aligned((void *)buf) &&
 	    zfs_dio_aligned(offset, size, dn->dn_datablksz)) {
 		abd_t *data = abd_get_from_buf((void *)buf, size);
 		int error = dmu_write_abd(dn, offset, size, data, flags, tx);
 
 		abd_free(data);
 		return (error);
 	}
 
 	/* buffered path: Direct I/O must not leak into the dbuf layer */
 	flags &= ~DMU_DIRECTIO;
 
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp, flags));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx, flags);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (0);
 }
 
 /*
  * Hold every buffer covering [offset, offset + size) of <os, object> and
  * call dmu_buf_will_not_fill() on each of them in the given transaction.
  * A zero size is a no-op.
  */
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
 	for (int idx = 0; idx < numbufs; idx++)
 		dmu_buf_will_not_fill(dbp[idx], tx);
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 /*
  * Write embedded data to the block at 'offset' of <os, object>.  The
  * buffer is held without reading it, handed to dmu_buf_write_embedded(),
  * and released again.
  */
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 
 	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
 	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
 
 	bp_embedded_type_t embed_type = (bp_embedded_type_t)etype;
 	enum zio_compress compress = (enum zio_compress)comp;
 
 	VERIFY0(dmu_buf_hold_noread(os, object, offset,
 	    FTAG, &db));
 
 	dmu_buf_write_embedded(db, data, embed_type, compress,
 	    uncompressed_size, compressed_size, byteorder, tx);
 
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Call dmu_buf_redact() on every buffer covering [offset, offset + size)
  * of <os, object> in the given transaction.
  */
 void
 dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG,
 	    &numbufs, &dbp));
 
 	for (int idx = 0; idx < numbufs; idx++) {
 		dmu_buf_t *target = dbp[idx];
 
 		dmu_buf_redact(target, tx);
 	}
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 #ifdef _KERNEL
 /*
  * Read 'size' bytes from the dnode into the uio, starting at
  * zfs_uio_offset(uio).  A uio flagged UIO_DIRECT is handed to
  * dmu_read_uio_direct(); otherwise the data is copied out of held dbufs.
  */
 int
 dmu_read_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size,
     dmu_flags_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs, err;
 
 	if (uio->uio_extflg & UIO_DIRECT)
 		return (dmu_read_uio_direct(dn, uio, size, flags));
 	flags &= ~DMU_DIRECTIO;
 
 	/*
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), size,
 	    TRUE, FTAG, &numbufs, &dbp, flags);
 	if (err != 0)
 		return (err);
 
 	/* stop at the first buffer whose copy-out fails */
 	for (int i = 0; i < numbufs && err == 0; i++) {
 		dmu_buf_t *db = dbp[i];
 		int64_t bufoff;
 		uint64_t tocpy;
 
 		ASSERT(size > 0);
 
 		bufoff = zfs_uio_offset(uio) - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, size);
 
 		ASSERT(db->db_data != NULL);
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy,
 		    UIO_READ, uio);
 
 		if (err == 0)
 			size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (err);
 }
 
 /*
  * Read 'size' bytes into the uio buffer.
  * From object zdb->db_object.
  * Starting at zfs_uio_offset(uio).
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
  * because we don't have to find the dnode_t for the object.
  */
 int
 dmu_read_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
     dmu_flags_t flags)
 {
 	dmu_buf_impl_t *owner = (dmu_buf_impl_t *)zdb;
 	int rc;
 
 	/* nothing to do for an empty read */
 	if (size == 0)
 		return (0);
 
 	DB_DNODE_ENTER(owner);
 	rc = dmu_read_uio_dnode(DB_DNODE(owner), uio, size, flags);
 	DB_DNODE_EXIT(owner);
 
 	return (rc);
 }
 
 /*
  * Read 'size' bytes into the uio buffer.
  * From the specified object
  * Starting at offset zfs_uio_offset(uio).
  *
  * The dnode is looked up and held only for the duration of the call.
  */
 int
 dmu_read_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
     dmu_flags_t flags)
 {
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err == 0) {
 		err = dmu_read_uio_dnode(dn, uio, size, flags);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio into the dnode, starting at
  * zfs_uio_offset(uio), in the given transaction.
  *
  * For a uio flagged UIO_DIRECT the request may be split: block-aligned
  * portions are passed to dmu_write_uio_direct() and the remainder is
  * written through held dbufs, looping via the 'top' label until 'size'
  * is used up.  Without UIO_DIRECT the whole request is written through
  * held dbufs in a single pass.
  *
  * Returns 0, or the first error from dmu_write_uio_direct(),
  * dmu_buf_hold_array_by_dnode() or zfs_uio_fault_move().
  */
 int
 dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx,
     dmu_flags_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	uint64_t write_size;
 	/* caller's flags, restored before each retry from 'top' */
 	dmu_flags_t oflags = flags;
 
 top:
 	write_size = size;
 
 	/*
 	 * We only allow Direct I/O writes to happen if we are block
 	 * sized aligned. Otherwise, we pass the write off to the ARC.
 	 */
 	if ((uio->uio_extflg & UIO_DIRECT) &&
 	    (write_size >= dn->dn_datablksz)) {
 		if (zfs_dio_aligned(zfs_uio_offset(uio), write_size,
 		    dn->dn_datablksz)) {
 			/* fully aligned: the whole remainder goes direct */
 			return (dmu_write_uio_direct(dn, uio, size, flags, tx));
 		} else if (write_size > dn->dn_datablksz &&
 		    zfs_dio_offset_aligned(zfs_uio_offset(uio),
 		    dn->dn_datablksz)) {
 			/*
 			 * Aligned start, unaligned length: write the whole
 			 * blocks directly and loop for the tail.
 			 */
 			write_size =
 			    dn->dn_datablksz * (write_size / dn->dn_datablksz);
 			err = dmu_write_uio_direct(dn, uio, write_size, flags,
 			    tx);
 			if (err == 0) {
 				size -= write_size;
 				goto top;
 			} else {
 				return (err);
 			}
 		} else {
 			/*
 			 * Unaligned start: do a buffered write first.
 			 * NOTE(review): P2PHASE yields the offset's position
 			 * within its block -- confirm this is the intended
 			 * length for the buffered portion.
 			 */
 			write_size =
 			    P2PHASE(zfs_uio_offset(uio), dn->dn_datablksz);
 		}
 	}
 	flags &= ~DMU_DIRECTIO;
 
 	err = dmu_buf_hold_array_by_dnode(dn, zfs_uio_offset(uio), write_size,
 	    FALSE, FTAG, &numbufs, &dbp, flags);
 	if (err)
 		return (err);
 
 	for (int i = 0; i < numbufs; i++) {
 		uint64_t tocpy;
 		int64_t bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(write_size > 0);
 
 		/* remember the uio offset so progress can be undone below */
 		offset_t off = zfs_uio_offset(uio);
 		bufoff = off - db->db_offset;
 		tocpy = MIN(db->db_size - bufoff, write_size);
 
 		/* only the first and last buffers may be partially written */
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size) {
 			dmu_buf_will_fill_flags(db, tx, B_TRUE, flags);
 		} else {
 			if (i == numbufs - 1 && bufoff + tocpy < db->db_size) {
 				if (bufoff == 0)
 					flags |= DMU_PARTIAL_FIRST;
 				else
 					flags |= DMU_PARTIAL_MORE;
 			}
 			dmu_buf_will_dirty_flags(db, tx, flags);
 		}
 
 		ASSERT(db->db_data != NULL);
 		err = zfs_uio_fault_move((char *)db->db_data + bufoff,
 		    tocpy, UIO_WRITE, uio);
 
 		if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
 			/* The fill was reverted.  Undo any uio progress. */
 			zfs_uio_advance(uio, off - zfs_uio_offset(uio));
 		}
 
 		if (err)
 			break;
 
 		write_size -= tocpy;
 		size -= tocpy;
 	}
 
 	IMPLY(err == 0, write_size == 0);
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	/* Direct I/O request with bytes left over: go around again */
 	if ((uio->uio_extflg & UIO_DIRECT) && size > 0) {
 		flags = oflags;
 		goto top;
 	}
 
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer.
  * To object zdb->db_object.
  * Starting at offset zfs_uio_offset(uio).
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
  * because we don't have to find the dnode_t for the object.
  */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx, dmu_flags_t flags)
 {
 	dmu_buf_impl_t *owner = (dmu_buf_impl_t *)zdb;
 	int rc;
 
 	/* nothing to do for an empty write */
 	if (size == 0)
 		return (0);
 
 	DB_DNODE_ENTER(owner);
 	rc = dmu_write_uio_dnode(DB_DNODE(owner), uio, size, tx, flags);
 	DB_DNODE_EXIT(owner);
 
 	return (rc);
 }
 
 /*
  * Copy 'size' bytes out of the uio into object 'object' of objset 'os',
  * beginning at the uio's current offset, zfs_uio_offset(uio).
  *
  * Returns 0 for a zero-length request, the dnode_hold() error if the
  * object cannot be held, and otherwise whatever dmu_write_uio_dnode()
  * returns.
  */
 int
 dmu_write_uio(objset_t *os, uint64_t object, zfs_uio_t *uio, uint64_t size,
     dmu_tx_t *tx, dmu_flags_t flags)
 {
 	dnode_t *dn;
 	int error;
 
 	if (size == 0)
 		return (0);
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error == 0) {
 		/* The hold only needs to span the copy itself. */
 		error = dmu_write_uio_dnode(dn, uio, size, tx, flags);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 #endif /* _KERNEL */
 
 /*
  * Walk an array of 'nbps' block pointers and add the logical size of each
  * cached one to a running total.  A block whose arc_cached() result has the
  * L2 bit set and the L1 bit clear is counted in *l2sz; every other block
  * with a non-zero arc_cached() result is counted in *l1sz.  Holes and
  * uncached blocks are skipped.  A NULL 'bps' is a no-op.
  */
 static void
 dmu_cached_bps(spa_t *spa, blkptr_t *bps, uint_t nbps,
     uint64_t *l1sz, uint64_t *l2sz)
 {
 	const int l1_or_l2 = ARC_CACHED_IN_L1 | ARC_CACHED_IN_L2;
 
 	if (bps == NULL)
 		return;
 
 	for (uint_t i = 0; i < nbps; i++) {
 		blkptr_t *bp = &bps[i];
 		int where;
 
 		if (BP_IS_HOLE(bp))
 			continue;
 
 		where = arc_cached(spa, bp);
 		if (where == 0)
 			continue;
 
 		if ((where & l1_or_l2) != ARC_CACHED_IN_L2)
 			*l1sz += BP_GET_LSIZE(bp);
 		else
 			*l2sz += BP_GET_LSIZE(bp);
 	}
 }
 
 /*
  * Estimate DMU object cached size.
  *
  * On return *l1sz and *l2sz hold the summed logical sizes of the blocks
  * that dmu_cached_bps() classified as L1- and L2-cached respectively.  Both
  * are reset to zero first, and stay zero when the object cannot be held or
  * has fewer than two levels (there are no L1 indirect blocks to walk).
  *
  * Returns 0, or EINTR if a signal was noticed while walking the L1 blocks;
  * in the EINTR case the totals cover only the blocks visited so far.
  * Errors from holding or reading an individual L1 block are deliberately
  * ignored.
  */
 int
 dmu_object_cached_size(objset_t *os, uint64_t object,
     uint64_t *l1sz, uint64_t *l2sz)
 {
 	dnode_t *dn;
 	dmu_object_info_t doi;
 	int err = 0;
 
 	*l1sz = *l2sz = 0;
 
 	if (dnode_hold(os, object, FTAG, &dn) != 0)
 		return (0);
 
 	if (dn->dn_nlevels < 2) {
 		dnode_rele(dn, FTAG);
 		return (0);
 	}
 
 	dmu_object_info_from_dnode(dn, &doi);
 
 	/*
 	 * dmu_prefetch_max is the loop stride.  If it is zero the offset
 	 * would never advance and this loop would spin forever, so the
 	 * prefetch pass is skipped entirely in that case.
 	 */
 	for (uint64_t off = 0; off < doi.doi_max_offset &&
 	    dmu_prefetch_max > 0; off += dmu_prefetch_max) {
 		/* dbuf_read doesn't prefetch L1 blocks. */
 		dmu_prefetch_by_dnode(dn, 1, off,
 		    dmu_prefetch_max, ZIO_PRIORITY_SYNC_READ);
 	}
 
 	/*
 	 * Hold all valid L1 blocks, asking ARC the status of each BP
 	 * contained in each such L1 block.
 	 */
 	uint_t nbps = bp_span_in_blocks(dn->dn_indblkshift, 1);
 	uint64_t l1blks = 1 + (dn->dn_maxblkid / nbps);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	for (uint64_t blk = 0; blk < l1blks; blk++) {
 		dmu_buf_impl_t *db = NULL;
 
 		if (issig()) {
 			/*
 			 * On interrupt, get out, and bubble up EINTR
 			 */
 			err = EINTR;
 			break;
 		}
 
 		/*
 		 * If we get an i/o error here, the L1 can't be read,
 		 * and nothing under it could be cached, so we just
 		 * continue. Ignoring the error from dbuf_hold_impl
 		 * or from dbuf_read is then a reasonable choice.
 		 */
 		err = dbuf_hold_impl(dn, 1, blk, B_TRUE, B_FALSE, FTAG, &db);
 		if (err != 0) {
 			/*
 			 * ignore error and continue
 			 */
 			err = 0;
 			continue;
 		}
 
 		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
 		if (err == 0) {
 			dmu_cached_bps(dmu_objset_spa(os), db->db.db_data,
 			    nbps, l1sz, l2sz);
 		}
 		/*
 		 * error may be ignored, and we continue
 		 */
 		err = 0;
 		dbuf_rele(db, FTAG);
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 /*
  * Loan out an anonymous arc buffer of 'size' bytes from the pool that the
  * dbuf behind 'handle' belongs to.
  */
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
 	spa_t *spa = ((dmu_buf_impl_t *)handle)->db_objset->os_spa;
 
 	return (arc_loan_buf(spa, B_FALSE, size));
 }
 
 /*
  * Free a loaned arc buffer.
  *
  * The buffer is first handed back with arc_return_buf() and then destroyed
  * with arc_buf_destroy(), both under this function's tag (FTAG).  The
  * caller must not use 'buf' afterwards.
  */
 void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
 	arc_buf_destroy(buf, FTAG);
 }
 
 /*
  * A "lightweight" write is cheaper than a regular one (such as
  * dmu_write_by_dnode() or dmu_assign_arcbuf_by_dnode()) since no
  * dmu_buf_impl_t or arc_buf_[hdr_]_t has to be created for it.  The price
  * is that the data can be neither read back nor overwritten until the
  * transaction's txg has synced, so it only suits workloads known to be
  * (temporarily) write-only, "zfs receive" being the typical example.
  *
  * Exactly one block is written, at the given byte offset.  On success 0 is
  * returned and ownership of the abd has passed to the dirty record (the
  * caller must not free it).  EIO is returned when no lightweight dirty
  * record could be obtained.
  */
 int
 dmu_lightweight_write_by_dnode(dnode_t *dn, uint64_t offset, abd_t *abd,
     const zio_prop_t *zp, zio_flag_t flags, dmu_tx_t *tx)
 {
 	uint64_t blkid = dbuf_whichblock(dn, 0, offset);
 	dbuf_dirty_record_t *dr = dbuf_dirty_lightweight(dn, blkid, tx);
 
 	if (dr == NULL)
 		return (SET_ERROR(EIO));
 
 	/* Stash everything the eventual write needs in the record. */
 	dr->dt.dll.dr_abd = abd;
 	dr->dt.dll.dr_props = *zp;
 	dr->dt.dll.dr_flags = flags;
 
 	return (0);
 }
 
 /*
  * Attach a loaned arc buffer directly to the dbuf covering 'offset' when
  * that is possible.  Otherwise fall back to copying the buffer contents in
  * with dmu_write_by_dnode() and returning the loan.
  *
  * Returns EIO if the dbuf cannot be held, 0 otherwise.  In both successful
  * paths the caller no longer owns 'buf'.
  */
 int
 dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx, dmu_flags_t flags)
 {
 	objset_t *os = dn->dn_objset;
 	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
 	dmu_buf_impl_t *db;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, dbuf_whichblock(dn, 0, offset), FTAG);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (db == NULL)
 		return (SET_ERROR(EIO));
 
 	/*
 	 * Direct assignment needs a block-aligned offset and an arc buf of
 	 * exactly the dbuf's size.  Anything else takes the copy path.
 	 */
 	if (offset != db->db.db_offset || blksz != db->db.db_size) {
 		/* compressed bufs must always be assignable to their dbuf */
 		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
 		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
 
 		dbuf_rele(db, FTAG);
 		dmu_write_by_dnode(dn, offset, blksz, buf->b_data, tx, flags);
 		dmu_return_arcbuf(buf);
 		return (0);
 	}
 
 	zfs_racct_write(os->os_spa, blksz, 1, flags);
 	dbuf_assign_arcbuf(db, buf, tx, flags);
 	dbuf_rele(db, FTAG);
 
 	return (0);
 }
 
 /*
  * Same as dmu_assign_arcbuf_by_dnode(), except that the target dnode is
  * taken from a dbuf the caller already holds.
  */
 int
 dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx, dmu_flags_t flags)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)handle;
 	int error;
 
 	DB_DNODE_ENTER(dbi);
 	error = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbi), offset, buf, tx,
 	    flags);
 	DB_DNODE_EXIT(dbi);
 
 	return (error);
 }
 
 /*
  * "Ready" callback for writes issued on behalf of dmu_sync().  'varg' is
  * the dmu_sync_arg_t for the write; 'buf' is unused.  Nothing is done if
  * the zio has already failed.
  */
 void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	(void) buf;
 	dmu_sync_arg_t *dsa = varg;
 
 	if (zio->io_error != 0)
 		return;
 
 	blkptr_t *bp = zio->io_bp;
 
 	if (BP_IS_HOLE(bp)) {
 		dbuf_dirty_record_t *dr = dsa->dsa_dr;
 		/* Without a dirty record, use the dbuf from the zgd. */
 		dmu_buf_t *db = (dr != NULL) ?
 		    &(dr->dr_dbuf->db) : dsa->dsa_zgd->zgd_db;
 
 		/*
 		 * A block of zeros may compress to a hole, but the
 		 * block size still needs to be known for replay.
 		 */
 		BP_SET_LSIZE(bp, db->db_size);
 		return;
 	}
 
 	if (!BP_IS_EMBEDDED(bp)) {
 		ASSERT(BP_GET_LEVEL(bp) == 0);
 		BP_SET_FILL(bp, 1);
 	}
 }
 
 /*
  * "Ready" callback for the zio_write() issued by dmu_sync_late_arrival().
  * It simply forwards to dmu_sync_ready(), passing the zio's private
  * pointer as the dmu_sync_arg_t and no arc buffer.
  */
 static void
 dmu_sync_late_arrival_ready(zio_t *zio)
 {
 	dmu_sync_ready(zio, NULL, zio->io_private);
 }
 
 /*
  * Completion callback for the arc_write() issued by dmu_sync().  'varg' is
  * the dmu_sync_arg_t allocated there; 'buf' is unused.
  *
  * Under db_mtx the dirty record leaves the DR_IN_DMU_SYNC state: on
  * success it becomes DR_OVERRIDDEN and remembers the block pointer that
  * was written, on failure it goes back to DR_NOT_OVERRIDDEN.  Waiters on
  * db_changed are then woken, the caller's done callback (if any) is
  * invoked with the zio error, and the argument structure is freed.
  */
 void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	(void) buf;
 	dmu_sync_arg_t *dsa = varg;
 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
 	/*
 	 * dsa_dr is dereferenced unconditionally; dmu_sync() always sets it
 	 * before issuing the write that completes here.
 	 */
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	/*
 	 * Record the vdev(s) backing this blkptr so they can be flushed after
 	 * the writes for the lwb have completed.
 	 */
 	if (zgd && zio->io_error == 0) {
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 	}
 
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
 		ASSERT0(dr->dt.dl.dr_has_raw_params);
 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
 		if (dr->dt.dl.dr_nopwrite) {
 			blkptr_t *bp = zio->io_bp;
 			blkptr_t *bp_orig = &zio->io_bp_orig;
 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
 			/*
 			 * A nopwrite must have left the block pointer
 			 * untouched, and it must still match the dbuf's.
 			 */
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			VERIFY(BP_EQUAL(bp, db->db_blkptr));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 			VERIFY(zio_checksum_table[chksum].ci_flags &
 			    ZCHECKSUM_FLAG_NOPWRITE);
 		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
 		dr->dt.dl.dr_gang_copies = zio->io_prop.zp_gang_copies;
 
 		/*
 		 * Old style holes are filled with all zeros, whereas
 		 * new-style holes maintain their lsize, type, level,
 		 * and birth time (see zio_write_compress). While we
 		 * need to reset the BP_SET_LSIZE() call that happened
 		 * in dmu_sync_ready for old style holes, we do *not*
 		 * want to wipe out the information contained in new
 		 * style holes. Thus, only zero out the block pointer if
 		 * it's an old style hole.
 		 */
 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
 		    BP_GET_LOGICAL_BIRTH(&dr->dt.dl.dr_overridden_by) == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		/* The write failed: leave the record as if never synced. */
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	}
 
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
 	if (dsa->dsa_done)
 		dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 /*
  * Completion callback for the zio_write() issued by
  * dmu_sync_late_arrival().  The zio's private pointer is the
  * dmu_sync_arg_t allocated there.
  *
  * In order: on success the written block is recorded against the lwb and,
  * unless it is a hole, passed to zio_free() in the txg it was written in;
  * the transaction held for the write is committed; the caller's done
  * callback is invoked with the zio error; and the abd wrapper and the
  * argument structure are freed.
  */
 static void
 dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
 	zgd_t *zgd = dsa->dsa_zgd;
 
 	if (zio->io_error == 0) {
 		/*
 		 * Record the vdev(s) backing this blkptr so they can be
 		 * flushed after the writes for the lwb have completed.
 		 */
 		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 
 		if (!BP_IS_HOLE(bp)) {
 			blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig;
 			ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
 			ASSERT(BP_GET_LOGICAL_BIRTH(zio->io_bp) == zio->io_txg);
 			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 			/*
 			 * NOTE(review): the block is freed in its own txg,
 			 * presumably because only the log references it;
 			 * confirm against the ZIL code.
 			 */
 			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 		}
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
 
 	/* Unlike dmu_sync_done(), dsa_done is called without a NULL check. */
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	/* io_abd is the wrapper created with abd_get_from_buf(). */
 	abd_free(zio->io_abd);
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 /*
  * Write the current contents of zgd->zgd_db to a freshly allocated block
  * on behalf of dmu_sync(), for the cases where the dirty record can no
  * longer be used (see the callers in dmu_sync()).  The write is issued as
  * a child of 'pio' in a new transaction and completes in
  * dmu_sync_late_arrival_done().
  *
  * Returns the dbuf_read() error if the buffer cannot be read, EIO if the
  * transaction cannot be assigned without waiting, and 0 once the write
  * has been issued.  'zp' is modified: nopwrite is always disabled.
  */
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
     zio_prop_t *zp, zbookmark_phys_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
 	int error;
 
 	/* The buffer contents are read before any tx is set up. */
 	error = dbuf_read((dmu_buf_impl_t *)zgd->zgd_db, NULL,
 	    DB_RF_CANFAIL | DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING);
 	if (error != 0)
 		return (error);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
 	/*
 	 * This transaction does not produce any dirty data or log blocks, so
 	 * it should not be throttled.  All other cases wait for TXG sync, by
 	 * which time the log block we are writing will be obsolete, so we can
 	 * skip waiting and just return error here instead.
 	 */
 	if (dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE) != 0) {
 		dmu_tx_abort(tx);
 		/* Make zl_get_data do txg_waited_synced() */
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * In order to prevent the zgd's lwb from being free'd prior to
 	 * dmu_sync_late_arrival_done() being called, we have to ensure
 	 * the lwb's "max txg" takes this tx's txg into account.
 	 */
 	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
 
 	/* Freed, and the tx committed, in dmu_sync_late_arrival_done(). */
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = NULL;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
 	/*
 	 * Since we are currently syncing this txg, it's nontrivial to
 	 * determine what BP to nopwrite against, so we disable nopwrite.
 	 *
 	 * When syncing, the db_blkptr is initially the BP of the previous
 	 * txg.  We can not nopwrite against it because it will be changed
 	 * (this is similar to the non-late-arrival case where the dbuf is
 	 * dirty in a future txg).
 	 *
 	 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
 	 * We can not nopwrite against it because although the BP will not
 	 * (typically) be changed, the data has not yet been persisted to this
 	 * location.
 	 *
 	 * Finally, when dbuf_write_done() is called, it is theoretically
 	 * possible to always nopwrite, because the data that was written in
 	 * this txg is the same data that we are trying to write.  However we
 	 * would need to check that this dbuf is not dirty in any future
 	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
 	 * don't nopwrite in this case.
 	 */
 	zp->zp_nopwrite = B_FALSE;
 
 	/* The abd wraps db_data in place; it is freed in the done callback. */
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done,
 	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
 
 /*
  * Intent log support: sync the block associated with db to disk.
  * N.B. and XXX: the caller is responsible for making sure that the
  * data isn't changing while dmu_sync() is writing it.
  *
  * 'pio' is the parent zio for the write and must not be NULL; 'txg' is the
  * transaction group the data was dirtied in and must not be 0; 'done' is
  * called with 'zgd' and the I/O error when an initiated write completes.
  *
  * Return values:
  *
  *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	EALREADY: this block is already in the process of being synced.
  *		The caller should track its progress (somehow).
  *
  *	EIO: could not do the I/O.
  *		The caller should do a txg_wait_synced().
  *
  *	0: the I/O has been initiated.
  *		The caller should log this blkptr in the done callback.
  *		It is possible that the I/O will fail, in which case
  *		the error will be reported to the done callback and
  *		propagated to pio from zio_done().
  */
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr, *dr_next;
 	dmu_sync_arg_t *dsa;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 
 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/* Work out the write properties once, before any locks are taken. */
 	DB_DNODE_ENTER(db);
 	dmu_write_policy(os, DB_DNODE(db), db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
 	 */
 	if (txg > spa_freeze_txg(os->os_spa))
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
 	/*
 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
 	 * and us.  If we determine that this txg is not yet syncing,
 	 * but it begins to sync a moment later, that's OK because the
 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 	 */
 	mutex_enter(&db->db_mtx);
 
 	if (txg <= spa_last_synced_txg(os->os_spa)) {
 		/*
 		 * This txg has already synced.  There's nothing to do.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	if (txg <= spa_syncing_txg(os->os_spa)) {
 		/*
 		 * This txg is currently syncing, so we can't mess with
 		 * the dirty record anymore; just write a new log block.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 	}
 
 	dr = dbuf_find_dirty_eq(db, txg);
 
 	if (dr == NULL) {
 		/*
 		 * There's no dr for this dbuf, so it must have been freed.
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* Any following record on the dirty list must be from an older txg. */
 	dr_next = list_next(&db->db_dirty_records, dr);
 	ASSERT(dr_next == NULL || dr_next->dr_txg < txg);
 
 	if (db->db_blkptr != NULL) {
 		/*
 		 * We need to fill in zgd_bp with the current blkptr so that
 		 * the nopwrite code can check if we're writing the same
 		 * data that's already on disk.  We can only nopwrite if we
 		 * are sure that after making the copy, db_blkptr will not
 		 * change until our i/o completes.  We ensure this by
 		 * holding the db_mtx, and only allowing nopwrite if the
 		 * block is not already dirty (see below).  This is verified
 		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
 		 * not changed.
 		 */
 		*zgd->zgd_bp = *db->db_blkptr;
 	}
 
 	/*
 	 * Assume the on-disk data is X, the current syncing data (in
 	 * txg - 1) is Y, and the current in-memory data is Z (currently
 	 * in dmu_sync).
 	 *
 	 * We usually want to perform a nopwrite if X and Z are the
 	 * same.  However, if Y is different (i.e. the BP is going to
 	 * change before this write takes effect), then a nopwrite will
 	 * be incorrect - we would override with X, which could have
 	 * been freed when Y was written.
 	 *
 	 * (Note that this is not a concern when we are nop-writing from
 	 * syncing context, because X and Y must be identical, because
 	 * all previous txgs have been synced.)
 	 *
 	 * Therefore, we disable nopwrite if the current BP could change
 	 * before this TXG.  There are two ways it could change: by
 	 * being dirty (dr_next is non-NULL), or by being freed
 	 * (dnode_block_freed()).  This behavior is verified by
 	 * zio_done(), which VERIFYs that the override BP is identical
 	 * to the on-disk BP.
 	 */
 	if (dr_next != NULL) {
 		zp.zp_nopwrite = B_FALSE;
 	} else {
 		DB_DNODE_ENTER(db);
 		if (dnode_block_freed(DB_DNODE(db), db->db_blkid))
 			zp.zp_nopwrite = B_FALSE;
 		DB_DNODE_EXIT(db);
 	}
 
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * We have already issued a sync write for this buffer,
 		 * or this buffer has already been synced.  It could not
 		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EALREADY));
 	}
 
 	/*
 	 * Claim the record while still holding db_mtx; dmu_sync_done()
 	 * moves it out of DR_IN_DMU_SYNC when the write completes.
 	 */
 	ASSERT0(dr->dt.dl.dr_has_raw_params);
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 	mutex_exit(&db->db_mtx);
 
 	/* Freed by dmu_sync_done(). */
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = dr;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp,
 	    dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db),
 	    dbuf_is_l2cacheable(db, NULL), &zp, dmu_sync_ready, NULL,
 	    dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL,
 	    &zb));
 
 	return (0);
 }
 
 /*
  * Set the number of indirection levels of 'object' via
  * dnode_set_nlevels().  Returns the dnode_hold() error if the object
  * cannot be held, otherwise the result of dnode_set_nlevels().
  */
 int
 dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		error = dnode_set_nlevels(dn, nlevels, tx);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Set the data block size ('size') and indirect block shift ('ibs') of
  * 'object' via dnode_set_blksz().  Returns the dnode_hold() error if the
  * object cannot be held, otherwise the result of dnode_set_blksz().
  */
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error == 0) {
 		error = dnode_set_blksz(dn, size, ibs, tx);
 		dnode_rele(dn, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Record 'maxblkid' on 'object' by calling dnode_new_blkid() with the
  * dnode's struct rwlock held as writer.  Returns the dnode_hold() error if
  * the object cannot be held, 0 otherwise.
  */
 int
 dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error = dnode_hold(os, object, FTAG, &dn);
 
 	if (error != 0)
 		return (error);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 
 	return (0);
 }
 
 /*
  * Set the checksum function of 'object' and mark its dnode dirty in 'tx'.
  * The object must exist: failure to hold it trips VERIFY0.
  */
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's checksum function.  This
 	 * check ensures that the receiving system can understand the
 	 * checksum function transmitted.
 	 */
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	/* Second range check, against the full checksum function table. */
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
 	dn->dn_checksum = checksum;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * Set the compression function of 'object' and mark its dnode dirty in
  * 'tx'.  The object must exist: failure to hold it trips VERIFY0.
  */
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's compression function.  This
 	 * check ensures that the receiving system can understand the
 	 * compression function transmitted.
 	 */
 	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	dn->dn_compress = compress;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * When the "redundant_metadata" property is set to "most", only indirect
  * blocks of this level and higher will have an additional ditto block.
  */
 static const int zfs_redundant_metadata_most_ditto_level = 2;
 
 /*
  * Fill in *zp with the write properties (compression, checksum, copies,
  * dedup, nopwrite, encryption, ...) to use for a block at 'level' of
  * dnode 'dn' in objset 'os'.  'wp' is a mask of WP_* flags describing the
  * kind of write.  When 'dn' is NULL the block is treated as being of type
  * DMU_OT_OBJSET.
  *
  * Every field of *zp assigned at the bottom of the function is
  * overwritten; nothing is read from *zp.
  */
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
 	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	uint8_t complevel = os->os_complevel;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
 	boolean_t dedup = B_FALSE;
 	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	boolean_t encrypt = B_FALSE;
 	int copies = os->os_copies;
 	int gang_copies = os->os_copies;
 
 	/*
 	 * We maintain different write policies for each of the following
 	 * types of data:
 	 *	 1. metadata
 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
 	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		/*
 		 * XXX -- we should design a compression algorithm
 		 * that specializes in arrays of bps.
 		 */
 		compress = zio_compress_select(os->os_spa,
 		    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
 
 		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
 		if (!(zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_METADATA) ||
 		    (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 
 		/*
 		 * Extra ("ditto") copies for metadata, according to the
 		 * redundant_metadata setting.  gang_copies is bumped under
 		 * looser conditions than copies in the MOST and SOME cases.
 		 */
 		switch (os->os_redundant_metadata) {
 		case ZFS_REDUNDANT_METADATA_ALL:
 			copies++;
 			gang_copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_MOST:
 			if (level >= zfs_redundant_metadata_most_ditto_level ||
 			    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
 				copies++;
 			if (level + 1 >=
 			    zfs_redundant_metadata_most_ditto_level ||
 			    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))
 				gang_copies++;
 			break;
 		case ZFS_REDUNDANT_METADATA_SOME:
-			if (DMU_OT_IS_CRITICAL(type)) {
+			if (DMU_OT_IS_CRITICAL(type, level)) {
 				copies++;
 				gang_copies++;
 			} else if (DMU_OT_IS_METADATA(type)) {
 				gang_copies++;
 			}
 			break;
 		case ZFS_REDUNDANT_METADATA_NONE:
 			break;
 		}
 
 		if (dmu_ddt_copies > 0) {
 			/*
 			 * If this tuneable is set, and this is a write for a
 			 * dedup entry store (zap or log), then we treat it
 			 * something like ZFS_REDUNDANT_METADATA_MOST on a
 			 * regular dataset: this many copies, and one more for
 			 * "higher" indirect blocks. This specific exception is
 			 * necessary because dedup objects are stored in the
 			 * MOS, which always has the highest possible copies.
 			 */
 			dmu_object_type_t stype =
 			    dn ? dn->dn_storage_type : DMU_OT_NONE;
 			if (stype == DMU_OT_NONE)
 				stype = type;
 			if (stype == DMU_OT_DDT_ZAP) {
 				copies = dmu_ddt_copies;
 				if (level >=
 				    zfs_redundant_metadata_most_ditto_level)
 					copies++;
 			}
 		}
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
 		/*
 		 * If we're writing preallocated blocks, we aren't actually
 		 * writing them so don't set any policy properties.  These
 		 * blocks are currently only used by an external subsystem
 		 * outside of zfs (i.e. dump) and not written by the zio
 		 * pipeline.
 		 */
 		compress = ZIO_COMPRESS_OFF;
 		checksum = ZIO_CHECKSUM_OFF;
 	} else {
 		/*
 		 * NOTE(review): dn is dereferenced without a NULL check in
 		 * this branch.  This presumably relies on a NULL dn always
 		 * taking the metadata branch above (type DMU_OT_OBJSET);
 		 * confirm.
 		 */
 		compress = zio_compress_select(os->os_spa, dn->dn_compress,
 		    compress);
 		complevel = zio_complevel_select(os->os_spa, compress,
 		    complevel, complevel);
 
 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
 		    zio_checksum_select(dn->dn_checksum, checksum) :
 		    dedup_checksum;
 
 		/*
 		 * Determine dedup setting.  If we are in dmu_sync(),
 		 * we won't actually dedup now because that's all
 		 * done in syncing context; but we do want to use the
 		 * dedup checksum.  If the checksum is not strong
 		 * enough to ensure unique signatures, force
 		 * dedup_verify.
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
 			if (!(zio_checksum_table[checksum].ci_flags &
 			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
 
 		/*
 		 * Enable nopwrite if we have secure enough checksum
 		 * algorithm (see comment in zio_nop_write) and
 		 * compression is enabled.  We don't enable nopwrite if
 		 * dedup is enabled as the two features are mutually
 		 * exclusive.
 		 */
 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 
 		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
 		    (os->os_redundant_metadata ==
 		    ZFS_REDUNDANT_METADATA_MOST &&
 		    zfs_redundant_metadata_most_ditto_level <= 1))
 			gang_copies++;
 	}
 
 	/*
 	 * All objects in an encrypted objset are protected from modification
 	 * via a MAC. Encrypted objects store their IV and salt in the last DVA
 	 * in the bp, so we cannot use all copies. Encrypted objects are also
 	 * not subject to nopwrite since writing the same data will still
 	 * result in a new ciphertext. Only encrypted blocks can be dedup'd
 	 * to avoid ambiguity in the dedup code since the DDT does not store
 	 * object types.
 	 */
 	if (os->os_encrypted && (wp & WP_NOFILL) == 0) {
 		encrypt = B_TRUE;
 
 		if (DMU_OT_IS_ENCRYPTED(type)) {
 			copies = MIN(copies, SPA_DVAS_PER_BP - 1);
 			gang_copies = MIN(gang_copies, SPA_DVAS_PER_BP - 1);
 			nopwrite = B_FALSE;
 		} else {
 			dedup = B_FALSE;
 		}
 
 		if (level <= 0 &&
 		    (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) {
 			compress = ZIO_COMPRESS_EMPTY;
 		}
 	}
 
 	/* Publish the decisions; copy counts are capped by the pool. */
 	zp->zp_compress = compress;
 	zp->zp_complevel = complevel;
 	zp->zp_checksum = checksum;
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
 	zp->zp_gang_copies = MIN(gang_copies, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	/* dedup_verify is only meaningful when dedup itself is on. */
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
 	zp->zp_encrypt = encrypt;
 	zp->zp_byteorder = ZFS_HOST_BYTEORDER;
 	zp->zp_direct_write = (wp & WP_DIRECT_WR) ? B_TRUE : B_FALSE;
 	memset(zp->zp_salt, 0, ZIO_DATA_SALT_LEN);
 	memset(zp->zp_iv, 0, ZIO_DATA_IV_LEN);
 	memset(zp->zp_mac, 0, ZIO_DATA_MAC_LEN);
 	zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ?
 	    os->os_zpl_special_smallblock : 0;
 	zp->zp_storage_type = dn ? dn->dn_storage_type : DMU_OT_NONE;
 
 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
 }
 
 /*
  * Report where the next data region (or, when 'hole' is set, the next
  * hole) in an object begins, updating *off.
  *
  * Holes can only be located accurately once all dirty data is on disk,
  * and forcing that makes hole seeking in a dirty file extremely slow.  The
  * compromise is to answer from the dnode only when it is clean; for a
  * dirty dnode EBUSY is returned, which tells the caller to assume there
  * are no holes and is always a safe answer.
  */
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	dnode_t *dn;
 	int err;
 
 	for (boolean_t retried = B_FALSE; ; retried = B_TRUE) {
 		err = dnode_hold(os, object, FTAG, &dn);
 		if (err)
 			return (err);
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 		if (!dnode_is_dirty(dn)) {
 			err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK |
 			    (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
 			break;
 		}
 
 		/*
 		 * The dnode is dirty.  Unless the zfs_dmu_offset_next_sync
 		 * module option asks for hole reporting, give up right away.
 		 */
 		if (!zfs_dmu_offset_next_sync) {
 			err = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/*
 		 * Hole reporting was requested, so the dirty dnode has to
 		 * be synced out first.  Drop everything before waiting.
 		 *
 		 * A caller holding an RL_READER rangelock over 0-UINT64_MAX
 		 * needs at most one retry.  Callers without the rangelock
 		 * are tolerated: after one retry they get EBUSY and no hole
 		 * information.
 		 */
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 
 		if (retried)
 			return (SET_ERROR(EBUSY));
 
 		txg_wait_synced(dmu_objset_pool(os), 0);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 /*
  * Copy the block pointers of the buffers covering [offset, offset +
  * length) of 'object' into the caller's array 'bps'.
  *
  * On entry *nbpsp is the capacity of 'bps'; on success it is set to the
  * number of entries filled in.  It is left untouched on failure.
  *
  * Returns 0 on success, or:
  *	ENXIO	dmu_buf_hold_array() reported ESRCH
  *	EAGAIN	a buffer is dirty (and not a clone made in this txg), or
  *		its block was born after the last synced txg
  *	EINVAL	a block is metadata, or was born after the freeze txg
  * Other dmu_buf_hold_array() errors are passed through.
  */
 int
 dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
     blkptr_t *bps, size_t *nbpsp)
 {
 	dmu_buf_t **dbp, *dbuf;
 	dmu_buf_impl_t *db;
 	blkptr_t *bp;
 	int error, numbufs;
 
 	error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
 	    &numbufs, &dbp);
 	if (error != 0) {
 		if (error == ESRCH) {
 			error = SET_ERROR(ENXIO);
 		}
 		return (error);
 	}
 
 	/*
 	 * NOTE(review): the capacity of 'bps' is only checked by this
 	 * assertion; callers presumably size the array to cover the whole
 	 * range - confirm.
 	 */
 	ASSERT3U(numbufs, <=, *nbpsp);
 
 	for (int i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 
 		mutex_enter(&db->db_mtx);
 
 		if (!list_is_empty(&db->db_dirty_records)) {
 			dbuf_dirty_record_t *dr;
 
 			dr = list_head(&db->db_dirty_records);
 			if (dr->dt.dl.dr_brtwrite) {
 				/*
 				 * This is very special case where we clone a
 				 * block and in the same transaction group we
 				 * read its BP (most likely to clone the clone).
 				 */
 				bp = &dr->dt.dl.dr_overridden_by;
 			} else {
 				/*
 				 * The block was modified in the same
 				 * transaction group.
 				 */
 				mutex_exit(&db->db_mtx);
 				error = SET_ERROR(EAGAIN);
 				goto out;
 			}
 		} else {
 			bp = db->db_blkptr;
 		}
 
 		mutex_exit(&db->db_mtx);
 
 		if (bp == NULL) {
 			/*
 			 * The file size was increased, but the block was never
 			 * written, otherwise we would either have the block
 			 * pointer or the dirty record and would not get here.
 			 * It is effectively a hole, so report it as such.
 			 */
 			BP_ZERO(&bps[i]);
 			continue;
 		}
 		/*
 		 * Make sure we clone only data blocks.
 		 */
 		if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		/*
 		 * If the block was allocated in transaction group that is not
 		 * yet synced, we could clone it, but we couldn't write this
 		 * operation into ZIL, or it may be impossible to replay, since
 		 * the block may appear not yet allocated at that point.
 		 */
 		if (BP_GET_BIRTH(bp) > spa_freeze_txg(os->os_spa)) {
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 		if (BP_GET_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) {
 			error = SET_ERROR(EAGAIN);
 			goto out;
 		}
 
 		bps[i] = *bp;
 	}
 
 	*nbpsp = numbufs;
 out:
 	/* Shared exit: the buffer holds are dropped on success and failure. */
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 /*
  * Make the buffers covering [offset, offset + length) of 'object' refer to
  * the 'nbps' block pointers in 'bps' instead of holding their own data,
  * within transaction 'tx'.  The number of buffers in the range must equal
  * 'nbps', and failure to hold them trips VERIFY0.
  *
  * Returns 0 on success, or EXDEV if any non-hole block pointer's logical
  * size differs from the size of the buffer it would replace.  In the
  * EXDEV case nothing has been modified.
  */
 int
 dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
     dmu_tx_t *tx, const blkptr_t *bps, size_t nbps)
 {
 	spa_t *spa;
 	dmu_buf_t **dbp, *dbuf;
 	dmu_buf_impl_t *db;
 	struct dirty_leaf *dl;
 	dbuf_dirty_record_t *dr;
 	const blkptr_t *bp;
 	int error = 0, i, numbufs;
 
 	spa = os->os_spa;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
 	    &numbufs, &dbp));
 	ASSERT3U(nbps, ==, numbufs);
 
 	/*
 	 * Before we start cloning make sure that the dbufs sizes match new BPs
 	 * sizes. If they don't, that's a no-go, as we are not able to shrink
 	 * dbufs.
 	 */
 	for (i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
 		ASSERT3U(db->db.db_object, !=, DMU_META_DNODE_OBJECT);
 		ASSERT0(db->db_level);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(db->db_blkid != DMU_SPILL_BLKID);
 
 		if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) {
 			error = SET_ERROR(EXDEV);
 			goto out;
 		}
 	}
 
 	/* Second pass: every size matched, so the clone can be applied. */
 	for (i = 0; i < numbufs; i++) {
 		dbuf = dbp[i];
 		db = (dmu_buf_impl_t *)dbuf;
 		bp = &bps[i];
 
 		dmu_buf_will_clone_or_dio(dbuf, tx);
 
 		mutex_enter(&db->db_mtx);
 
 		/*
 		 * The call above is expected to have left a dirty record
 		 * for this txg at the head of the list.
 		 */
 		dr = list_head(&db->db_dirty_records);
 		VERIFY(dr != NULL);
 		ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
 		dl = &dr->dt.dl;
 		ASSERT0(dl->dr_has_raw_params);
 		dl->dr_overridden_by = *bp;
 		/*
 		 * Except for holes with a zero logical birth, stamp the
 		 * copied block pointer with this txg as its logical birth.
 		 */
 		if (!BP_IS_HOLE(bp) || BP_GET_LOGICAL_BIRTH(bp) != 0) {
 			if (!BP_IS_EMBEDDED(bp)) {
 				BP_SET_BIRTH(&dl->dr_overridden_by, dr->dr_txg,
 				    BP_GET_BIRTH(bp));
 			} else {
 				BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by,
 				    dr->dr_txg);
 			}
 		}
 		dl->dr_brtwrite = B_TRUE;
 		dl->dr_override_state = DR_OVERRIDDEN;
 
 		mutex_exit(&db->db_mtx);
 
 		/*
 		 * When data is embedded into the BP there is no need to
 		 * create a BRT entry as there is no data block. Just copy
 		 * the BP as it contains the data.
 		 */
 		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			brt_pending_add(spa, bp, tx);
 		}
 	}
 out:
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (error);
 }
 
 /*
  * Populate *doi from the dnode dn and its dn_phys. No locks are taken
  * here; dmu_object_info_from_dnode() is the wrapper that acquires them.
  */
 void
 __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	dnode_phys_t *phys = dn->dn_phys;
 
 	/* Object and bonus typing. */
 	doi->doi_type = dn->dn_type;
 	doi->doi_bonus_type = dn->dn_bonustype;
 	doi->doi_bonus_size = dn->dn_bonuslen;
 
 	/* Block geometry; an indirect block shift of 0 reports size 0. */
 	doi->doi_data_block_size = dn->dn_datablksz;
 	if (dn->dn_indblkshift) {
 		doi->doi_metadata_block_size = 1ULL << dn->dn_indblkshift;
 	} else {
 		doi->doi_metadata_block_size = 0;
 	}
 	doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT;
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_nblkptr = dn->dn_nblkptr;
 
 	/* Write policy. */
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
 
 	/* Space used, rounded to the nearest 512-byte unit. */
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(phys) + 256) >> 9;
 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 
 	/* Fill count is the sum over the block pointers in dn_phys. */
 	doi->doi_fill_count = 0;
 	for (int idx = 0; idx < phys->dn_nblkptr; idx++) {
 		doi->doi_fill_count += BP_GET_FILL(&phys->dn_blkptr[idx]);
 	}
 }
 
 /*
  * Fill in *doi from dn while holding dn_struct_rwlock as reader and
  * dn_mtx. The actual copying is done by __dmu_object_info_from_dnode().
  */
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	/* Acquire the rwlock first, then the mutex. */
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	mutex_enter(&dn->dn_mtx);
 
 	__dmu_object_info_from_dnode(dn, doi);
 
 	/* Release in the reverse order of acquisition. */
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Look up a DMU object and report information about it.
  *
  * When doi is NULL nothing is filled in and the return value alone tells
  * the caller whether the object could be held. Returns 0 on success or
  * the error from dnode_hold().
  */
 int
 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
 {
 	dnode_t *dnode;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dnode);
 	if (error == 0) {
 		if (doi != NULL) {
 			dmu_object_info_from_dnode(dnode, doi);
 		}
 		/* Drop the hold taken above. */
 		dnode_rele(dnode, FTAG);
 	}
 
 	return (error);
 }
 
 /*
  * Variant of dmu_object_info() for callers that already hold a dbuf of
  * the object: the dnode is reached through the dbuf rather than being
  * looked up and held again.
  */
 void
 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db_fake;
 
 	/* The dnode is only accessed between DB_DNODE_ENTER and _EXIT. */
 	DB_DNODE_ENTER(impl);
 	dmu_object_info_from_dnode(DB_DNODE(impl), doi);
 	DB_DNODE_EXIT(impl);
 }
 
 /*
  * Cheapest variant, for callers that only need sizes: reports the data
  * block size in *blksize and the space used, in SPA_MINBLOCKSIZE units,
  * in *nblk512.
  */
 void
 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
     u_longlong_t *nblk512)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dnode;
 	uint64_t used_bytes, used_units;
 
 	DB_DNODE_ENTER(impl);
 	dnode = DB_DNODE(impl);
 
 	*blksize = dnode->dn_datablksz;
 
 	/* Round the used byte count to the nearest minimum-size block. */
 	used_bytes = DN_USED_BYTES(dnode->dn_phys);
 	used_units = (used_bytes + SPA_MINBLOCKSIZE / 2) >> SPA_MINBLOCKSHIFT;
 
 	/* add in number of slots used for the dnode itself */
 	*nblk512 = used_units + dnode->dn_num_slots;
 
 	DB_DNODE_EXIT(impl);
 }
 
 /*
  * Report, in *dnsize, the dnode's slot count shifted left by
  * DNODE_SHIFT, for the object that the held dbuf belongs to.
  */
 void
 dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db_fake;
 	int bytes;
 
 	DB_DNODE_ENTER(impl);
 	bytes = DB_DNODE(impl)->dn_num_slots << DNODE_SHIFT;
 	*dnsize = bytes;
 	DB_DNODE_EXIT(impl);
 }
 
 /*
  * Byte-swap, in place, every 64-bit element of the buffer at vbuf.
  *
  * size is the buffer length in bytes and must be a multiple of 8
  * (asserted). The loop index is a size_t so that it has the same type
  * as the element count: no signed/unsigned comparison and no signed
  * overflow for counts above INT_MAX.
  */
 void
 byteswap_uint64_array(void *vbuf, size_t size)
 {
 	uint64_t *buf = vbuf;
 	size_t count = size >> 3;
 
 	ASSERT((size & 7) == 0);
 
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_64(buf[i]);
 }
 
 /*
  * Byte-swap, in place, every 32-bit element of the buffer at vbuf.
  *
  * size is the buffer length in bytes and must be a multiple of 4
  * (asserted). The loop index is a size_t so that it has the same type
  * as the element count: no signed/unsigned comparison and no signed
  * overflow for counts above INT_MAX.
  */
 void
 byteswap_uint32_array(void *vbuf, size_t size)
 {
 	uint32_t *buf = vbuf;
 	size_t count = size >> 2;
 
 	ASSERT((size & 3) == 0);
 
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_32(buf[i]);
 }
 
 /*
  * Byte-swap, in place, every 16-bit element of the buffer at vbuf.
  *
  * size is the buffer length in bytes and must be a multiple of 2
  * (asserted). The loop index is a size_t so that it has the same type
  * as the element count: no signed/unsigned comparison and no signed
  * overflow for counts above INT_MAX.
  */
 void
 byteswap_uint16_array(void *vbuf, size_t size)
 {
 	uint16_t *buf = vbuf;
 	size_t count = size >> 1;
 
 	ASSERT((size & 1) == 0);
 
 	for (size_t i = 0; i < count; i++)
 		buf[i] = BSWAP_16(buf[i]);
 }
 
 /*
  * 8-bit counterpart of the byteswap_uintNN_array() routines. It
  * deliberately does nothing: the buffer is left untouched, and the
  * casts below only mark both parameters as intentionally unused.
  */
 void
 byteswap_uint8_array(void *vbuf, size_t size)
 {
 	(void) vbuf;
 	(void) size;
 }
 
 /*
  * Run the initialization routines of the subsystems listed below, in
  * exactly this order. dmu_fini() calls the matching *_fini() routines.
  */
 void
 dmu_init(void)
 {
 	abd_init();
 	zfs_dbgmsg_init();
 	sa_cache_init();
 	dmu_objset_init();
 	dnode_init();
 	zfetch_init();
 	dmu_tx_init();
 	l2arc_init();
 	arc_init();
 	dbuf_init();
 }
 
 /*
  * Run the teardown routine for every subsystem set up by dmu_init().
  *
  * The order is not an exact mirror of dmu_init(): arc_fini() runs first
  * (see the note on that line) and dbuf_fini() runs after zfetch_fini()
  * rather than at the very start.
  */
 void
 dmu_fini(void)
 {
 	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	dmu_tx_fini();
 	zfetch_fini();
 	dbuf_fini();
 	dnode_fini();
 	dmu_objset_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
 	abd_fini();
 }
 
 /*
  * DMU symbols published through EXPORT_SYMBOL(): bonus/dbuf holds,
  * prefetch, free, read/write, object info and attribute setters, sync
  * and arcbuf helpers, plus the dmu_ot table.
  */
 EXPORT_SYMBOL(dmu_bonus_hold);
 EXPORT_SYMBOL(dmu_bonus_hold_by_dnode);
 EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus);
 EXPORT_SYMBOL(dmu_buf_rele_array);
 EXPORT_SYMBOL(dmu_prefetch);
 EXPORT_SYMBOL(dmu_prefetch_by_dnode);
 EXPORT_SYMBOL(dmu_prefetch_dnode);
 EXPORT_SYMBOL(dmu_free_range);
 EXPORT_SYMBOL(dmu_free_long_range);
 EXPORT_SYMBOL(dmu_free_long_object);
 EXPORT_SYMBOL(dmu_read);
 EXPORT_SYMBOL(dmu_read_by_dnode);
 EXPORT_SYMBOL(dmu_read_uio);
 EXPORT_SYMBOL(dmu_read_uio_dbuf);
 EXPORT_SYMBOL(dmu_read_uio_dnode);
 EXPORT_SYMBOL(dmu_write);
 EXPORT_SYMBOL(dmu_write_by_dnode);
 EXPORT_SYMBOL(dmu_write_uio);
 EXPORT_SYMBOL(dmu_write_uio_dbuf);
 EXPORT_SYMBOL(dmu_write_uio_dnode);
 EXPORT_SYMBOL(dmu_prealloc);
 EXPORT_SYMBOL(dmu_object_info);
 EXPORT_SYMBOL(dmu_object_info_from_dnode);
 EXPORT_SYMBOL(dmu_object_info_from_db);
 EXPORT_SYMBOL(dmu_object_size_from_db);
 EXPORT_SYMBOL(dmu_object_dnsize_from_db);
 EXPORT_SYMBOL(dmu_object_set_nlevels);
 EXPORT_SYMBOL(dmu_object_set_blocksize);
 EXPORT_SYMBOL(dmu_object_set_maxblkid);
 EXPORT_SYMBOL(dmu_object_set_checksum);
 EXPORT_SYMBOL(dmu_object_set_compress);
 EXPORT_SYMBOL(dmu_offset_next);
 EXPORT_SYMBOL(dmu_write_policy);
 EXPORT_SYMBOL(dmu_sync);
 EXPORT_SYMBOL(dmu_request_arcbuf);
 EXPORT_SYMBOL(dmu_return_arcbuf);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode);
 EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf);
 EXPORT_SYMBOL(dmu_buf_hold);
 EXPORT_SYMBOL(dmu_ot);
 
 /*
  * Tunables declared through ZFS_MODULE_PARAM(). Every entry below is
  * declared with ZMOD_RW and carries its own one-line description as the
  * final argument.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW,
 	"Enable NOP writes");
 
 ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, UINT, ZMOD_RW,
 	"Percentage of dirtied blocks from frees in one TXG");
 
 ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW,
 	"Enable forcing txg sync to find holes");
 
 ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
 	"Limit one prefetch call to this size");
 
 ZFS_MODULE_PARAM(zfs, , dmu_ddt_copies, UINT, ZMOD_RW,
 	"Override copies= for dedup objects");