diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META
index 14cfc5f00a41..40376428ba95 100644
--- a/sys/contrib/openzfs/META
+++ b/sys/contrib/openzfs/META
@@ -1,10 +1,10 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.2.5
+Version:       2.2.6
 Release:       1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
-Linux-Maximum: 6.9
+Linux-Maximum: 6.10
 Linux-Minimum: 3.10
diff --git a/sys/contrib/openzfs/config/kernel-blk-queue.m4 b/sys/contrib/openzfs/config/kernel-blk-queue.m4
index 2f0b386e6637..a064140f337a 100644
--- a/sys/contrib/openzfs/config/kernel-blk-queue.m4
+++ b/sys/contrib/openzfs/config/kernel-blk-queue.m4
@@ -1,433 +1,461 @@
 dnl #
 dnl # 2.6.39 API change,
 dnl # blk_start_plug() and blk_finish_plug()
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG], [
 	ZFS_LINUX_TEST_SRC([blk_plug], [
 		#include <linux/blkdev.h>
 	],[
 		struct blk_plug plug __attribute__ ((unused));
 
 		blk_start_plug(&plug);
 		blk_finish_plug(&plug);
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [
 	AC_MSG_CHECKING([whether struct blk_plug is available])
 	ZFS_LINUX_TEST_RESULT([blk_plug], [
 		AC_MSG_RESULT(yes)
 	],[
 		ZFS_LINUX_TEST_ERROR([blk_plug])
 	])
 ])
 
 dnl #
 dnl # 2.6.32 - 4.11: statically allocated bdi in request_queue
 dnl # 4.12: dynamically allocated bdi in request_queue
+dnl # 6.11: bdi no longer available through request_queue, so get it from
+dnl #       the gendisk attached to the queue
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [
 	ZFS_LINUX_TEST_SRC([blk_queue_bdi], [
 		#include <linux/blkdev.h>
 	],[
 		struct request_queue q;
 		struct backing_dev_info bdi;
 		q.backing_dev_info = &bdi;
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [
 	AC_MSG_CHECKING([whether blk_queue bdi is dynamic])
 	ZFS_LINUX_TEST_RESULT([blk_queue_bdi], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_QUEUE_BDI_DYNAMIC, 1,
 		    [blk queue backing_dev_info is dynamic])
 	],[
 		AC_MSG_RESULT(no)
 	])
 ])
 
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI], [
+	ZFS_LINUX_TEST_SRC([blk_queue_disk_bdi], [
+		#include <linux/blkdev.h>
+		#include <linux/backing-dev.h>
+	], [
+		struct request_queue q;
+		struct gendisk disk;
+		struct backing_dev_info bdi __attribute__ ((unused));
+		q.disk = &disk;
+		q.disk->bdi = &bdi;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI], [
+	AC_MSG_CHECKING([whether backing_dev_info is available through queue gendisk])
+	ZFS_LINUX_TEST_RESULT([blk_queue_disk_bdi], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_BLK_QUEUE_DISK_BDI, 1,
+		    [backing_dev_info is available through queue gendisk])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
 dnl #
 dnl # 5.9: added blk_queue_update_readahead(),
 dnl # 5.15: renamed to disk_update_readahead()
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD], [
 	ZFS_LINUX_TEST_SRC([blk_queue_update_readahead], [
 		#include <linux/blkdev.h>
 	],[
 		struct request_queue q;
 		blk_queue_update_readahead(&q);
 	])
 
 	ZFS_LINUX_TEST_SRC([disk_update_readahead], [
 		#include <linux/blkdev.h>
 	],[
 		struct gendisk disk;
 		disk_update_readahead(&disk);
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD], [
 	AC_MSG_CHECKING([whether blk_queue_update_readahead() exists])
 	ZFS_LINUX_TEST_RESULT([blk_queue_update_readahead], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_QUEUE_UPDATE_READAHEAD, 1,
 		    [blk_queue_update_readahead() exists])
 	],[
 		AC_MSG_RESULT(no)
 
 		AC_MSG_CHECKING([whether disk_update_readahead() exists])
 		ZFS_LINUX_TEST_RESULT([disk_update_readahead], [
 			AC_MSG_RESULT(yes)
 			AC_DEFINE(HAVE_DISK_UPDATE_READAHEAD, 1,
 			    [disk_update_readahead() exists])
 		],[
 			AC_MSG_RESULT(no)
 		])
 	])
 ])
 
 dnl #
 dnl # 5.19: bdev_max_discard_sectors() available
 dnl # 2.6.32: blk_queue_discard() available
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD], [
 	ZFS_LINUX_TEST_SRC([bdev_max_discard_sectors], [
 		#include <linux/blkdev.h>
 	],[
 		struct block_device *bdev __attribute__ ((unused)) = NULL;
 		unsigned int error __attribute__ ((unused));
 
 		error = bdev_max_discard_sectors(bdev);
 	])
 
 	ZFS_LINUX_TEST_SRC([blk_queue_discard], [
 		#include <linux/blkdev.h>
 	],[
 		struct request_queue r;
 		struct request_queue *q = &r;
 		int value __attribute__ ((unused));
 		memset(q, 0, sizeof(r));
 		value = blk_queue_discard(q);
 	],[-Wframe-larger-than=8192])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISCARD], [
 	AC_MSG_CHECKING([whether bdev_max_discard_sectors() is available])
 	ZFS_LINUX_TEST_RESULT([bdev_max_discard_sectors], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BDEV_MAX_DISCARD_SECTORS, 1,
 		    [bdev_max_discard_sectors() is available])
 	],[
 		AC_MSG_RESULT(no)
 
 		AC_MSG_CHECKING([whether blk_queue_discard() is available])
 		ZFS_LINUX_TEST_RESULT([blk_queue_discard], [
 			AC_MSG_RESULT(yes)
 			AC_DEFINE(HAVE_BLK_QUEUE_DISCARD, 1,
 			    [blk_queue_discard() is available])
 		],[
 			ZFS_LINUX_TEST_ERROR([blk_queue_discard])
 		])
 	])
 ])
 
 dnl #
 dnl # 5.19: bdev_max_secure_erase_sectors() available
 dnl # 4.8: blk_queue_secure_erase() available
 dnl # 2.6.36: blk_queue_secdiscard() available
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE], [
 	ZFS_LINUX_TEST_SRC([bdev_max_secure_erase_sectors], [
 		#include <linux/blkdev.h>
 	],[
 		struct block_device *bdev __attribute__ ((unused)) = NULL;
 		unsigned int error __attribute__ ((unused));
 
 		error = bdev_max_secure_erase_sectors(bdev);
 	])
 
 	ZFS_LINUX_TEST_SRC([blk_queue_secure_erase], [
 		#include <linux/blkdev.h>
 	],[
 		struct request_queue r;
 		struct request_queue *q = &r;
 		int value __attribute__ ((unused));
 		memset(q, 0, sizeof(r));
 		value = blk_queue_secure_erase(q);
 	],[-Wframe-larger-than=8192])
 
 	ZFS_LINUX_TEST_SRC([blk_queue_secdiscard], [
 		#include <linux/blkdev.h>
 	],[
 		struct request_queue r;
 		struct request_queue *q = &r;
 		int value __attribute__ ((unused));
 		memset(q, 0, sizeof(r));
 		value = blk_queue_secdiscard(q);
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE], [
 	AC_MSG_CHECKING([whether bdev_max_secure_erase_sectors() is available])
 	ZFS_LINUX_TEST_RESULT([bdev_max_secure_erase_sectors], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BDEV_MAX_SECURE_ERASE_SECTORS, 1,
 		    [bdev_max_secure_erase_sectors() is available])
 	],[
 		AC_MSG_RESULT(no)
 
 		AC_MSG_CHECKING([whether blk_queue_secure_erase() is available])
 		ZFS_LINUX_TEST_RESULT([blk_queue_secure_erase], [
 			AC_MSG_RESULT(yes)
 			AC_DEFINE(HAVE_BLK_QUEUE_SECURE_ERASE, 1,
 			    [blk_queue_secure_erase() is available])
 		],[
 			AC_MSG_RESULT(no)
 
 			AC_MSG_CHECKING([whether blk_queue_secdiscard() is available])
 			ZFS_LINUX_TEST_RESULT([blk_queue_secdiscard], [
 				AC_MSG_RESULT(yes)
 			AC_DEFINE(HAVE_BLK_QUEUE_SECDISCARD, 1,
 				    [blk_queue_secdiscard() is available])
 			],[
 				ZFS_LINUX_TEST_ERROR([blk_queue_secure_erase])
 			])
 		])
 	])
 ])
 
 dnl #
 dnl # 4.16 API change,
 dnl # Introduction of blk_queue_flag_set and blk_queue_flag_clear
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_SET], [
 	ZFS_LINUX_TEST_SRC([blk_queue_flag_set], [
 		#include <linux/kernel.h>
 		#include <linux/blkdev.h>
 	],[
 		struct request_queue *q = NULL;
 		blk_queue_flag_set(0, q);
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET], [
 	AC_MSG_CHECKING([whether blk_queue_flag_set() exists])
 	ZFS_LINUX_TEST_RESULT([blk_queue_flag_set], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_QUEUE_FLAG_SET, 1,
 		    [blk_queue_flag_set() exists])
 	],[
 		AC_MSG_RESULT(no)
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_CLEAR], [
 	ZFS_LINUX_TEST_SRC([blk_queue_flag_clear], [
 		#include <linux/kernel.h>
 		#include <linux/blkdev.h>
 	],[
 		struct request_queue *q = NULL;
 		blk_queue_flag_clear(0, q);
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLAG_CLEAR], [
 	AC_MSG_CHECKING([whether blk_queue_flag_clear() exists])
 	ZFS_LINUX_TEST_RESULT([blk_queue_flag_clear], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_QUEUE_FLAG_CLEAR, 1,
 		    [blk_queue_flag_clear() exists])
 	],[
 		AC_MSG_RESULT(no)
 	])
 ])
 
 dnl #
 dnl # 2.6.36 API change,
 dnl # Added blk_queue_flush() interface, while the previous interface
 dnl # was available to all the new one is GPL-only.  Thus in addition to
 dnl # detecting if this function is available we determine if it is
 dnl # GPL-only.  If the GPL-only interface is there we implement our own
 dnl # compatibility function, otherwise we use the function.  The hope
 dnl # is that long term this function will be opened up.
 dnl #
 dnl # 4.7 API change,
 dnl # Replace blk_queue_flush with blk_queue_write_cache
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH], [
 	ZFS_LINUX_TEST_SRC([blk_queue_flush], [
 		#include <linux/blkdev.h>
 	], [
 		struct request_queue *q __attribute__ ((unused)) = NULL;
 		(void) blk_queue_flush(q, REQ_FLUSH);
 	], [], [ZFS_META_LICENSE])
 
 	ZFS_LINUX_TEST_SRC([blk_queue_write_cache], [
 		#include <linux/kernel.h>
 		#include <linux/blkdev.h>
 	], [
 		struct request_queue *q __attribute__ ((unused)) = NULL;
 		blk_queue_write_cache(q, true, true);
 	], [], [ZFS_META_LICENSE])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLUSH], [
 	AC_MSG_CHECKING([whether blk_queue_flush() is available])
 	ZFS_LINUX_TEST_RESULT([blk_queue_flush], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_QUEUE_FLUSH, 1,
 		    [blk_queue_flush() is available])
 
 		AC_MSG_CHECKING([whether blk_queue_flush() is GPL-only])
 		ZFS_LINUX_TEST_RESULT([blk_queue_flush_license], [
 			AC_MSG_RESULT(no)
 		],[
 			AC_MSG_RESULT(yes)
 			AC_DEFINE(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY, 1,
 			    [blk_queue_flush() is GPL-only])
 		])
 	],[
 		AC_MSG_RESULT(no)
 	])
 
 	dnl #
 	dnl # 4.7 API change
 	dnl # Replace blk_queue_flush with blk_queue_write_cache
 	dnl #
 	AC_MSG_CHECKING([whether blk_queue_write_cache() exists])
 	ZFS_LINUX_TEST_RESULT([blk_queue_write_cache], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE, 1,
 		    [blk_queue_write_cache() exists])
 
 		AC_MSG_CHECKING([whether blk_queue_write_cache() is GPL-only])
 		ZFS_LINUX_TEST_RESULT([blk_queue_write_cache_license], [
 			AC_MSG_RESULT(no)
 		],[
 			AC_MSG_RESULT(yes)
 			AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY, 1,
 			    [blk_queue_write_cache() is GPL-only])
 		])
 	],[
 		AC_MSG_RESULT(no)
 	])
 ])
 
 dnl #
 dnl # 2.6.34 API change
 dnl # blk_queue_max_hw_sectors() replaces blk_queue_max_sectors().
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS], [
 	ZFS_LINUX_TEST_SRC([blk_queue_max_hw_sectors], [
 		#include <linux/blkdev.h>
 	], [
 		struct request_queue *q __attribute__ ((unused)) = NULL;
 		(void) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
 	], [])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [
 	AC_MSG_CHECKING([whether blk_queue_max_hw_sectors() is available])
 	ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [
 		AC_MSG_RESULT(yes)
 	],[
 		AC_MSG_RESULT(no)
 	])
 ])
 
 dnl #
 dnl # 2.6.34 API change
 dnl # blk_queue_max_segments() consolidates blk_queue_max_hw_segments()
 dnl # and blk_queue_max_phys_segments().
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS], [
 	ZFS_LINUX_TEST_SRC([blk_queue_max_segments], [
 		#include <linux/blkdev.h>
 	], [
 		struct request_queue *q __attribute__ ((unused)) = NULL;
 		(void) blk_queue_max_segments(q, BLK_MAX_SEGMENTS);
 	], [])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [
 	AC_MSG_CHECKING([whether blk_queue_max_segments() is available])
 	ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [
 		AC_MSG_RESULT(yes)
 	], [
 		AC_MSG_RESULT(no)
 	])
 ])
 
 dnl #
 dnl # See if kernel supports block multi-queue and blk_status_t.
 dnl # blk_status_t represents the new status codes introduced in the 4.13
 dnl # kernel patch:
 dnl #
 dnl #  block: introduce new block status code type
 dnl #
 dnl # We do not currently support the "old" block multi-queue interfaces from
 dnl # prior kernels.
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_MQ], [
 	ZFS_LINUX_TEST_SRC([blk_mq], [
 		#include <linux/blk-mq.h>
 	], [
 		struct blk_mq_tag_set tag_set __attribute__ ((unused)) = {0};
 		(void) blk_mq_alloc_tag_set(&tag_set);
 		return BLK_STS_OK;
 	], [])
 	ZFS_LINUX_TEST_SRC([blk_mq_rq_hctx], [
 		#include <linux/blk-mq.h>
 		#include <linux/blkdev.h>
 	], [
 		struct request rq = {0};
 		struct blk_mq_hw_ctx *hctx = NULL;
 		rq.mq_hctx = hctx;
 	], [])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_MQ], [
 	AC_MSG_CHECKING([whether block multiqueue with blk_status_t is available])
 	ZFS_LINUX_TEST_RESULT([blk_mq], [
 		AC_MSG_RESULT(yes)
 		AC_DEFINE(HAVE_BLK_MQ, 1, [block multiqueue is available])
 		AC_MSG_CHECKING([whether block multiqueue hardware context is cached in struct request])
 		ZFS_LINUX_TEST_RESULT([blk_mq_rq_hctx], [
 			AC_MSG_RESULT(yes)
 			AC_DEFINE(HAVE_BLK_MQ_RQ_HCTX, 1, [block multiqueue hardware context is cached in struct request])
 		], [
 			AC_MSG_RESULT(no)
 		])
 	], [
 		AC_MSG_RESULT(no)
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISK_BDI
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_SET
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_CLEAR
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS
 	ZFS_AC_KERNEL_SRC_BLK_MQ
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [
 	ZFS_AC_KERNEL_BLK_QUEUE_PLUG
 	ZFS_AC_KERNEL_BLK_QUEUE_BDI
+	ZFS_AC_KERNEL_BLK_QUEUE_DISK_BDI
 	ZFS_AC_KERNEL_BLK_QUEUE_UPDATE_READAHEAD
 	ZFS_AC_KERNEL_BLK_QUEUE_DISCARD
 	ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE
 	ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET
 	ZFS_AC_KERNEL_BLK_QUEUE_FLAG_CLEAR
 	ZFS_AC_KERNEL_BLK_QUEUE_FLUSH
 	ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS
 	ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS
 	ZFS_AC_KERNEL_BLK_MQ
 ])
diff --git a/sys/contrib/openzfs/config/kernel-make-request-fn.m4 b/sys/contrib/openzfs/config/kernel-make-request-fn.m4
index 9813ad2fb3f3..4c54bdd6d4a2 100644
--- a/sys/contrib/openzfs/config/kernel-make-request-fn.m4
+++ b/sys/contrib/openzfs/config/kernel-make-request-fn.m4
@@ -1,213 +1,234 @@
 dnl #
 dnl # Check for make_request_fn interface.
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
 	ZFS_LINUX_TEST_SRC([make_request_fn_void], [
 		#include <linux/blkdev.h>
 		static void make_request(struct request_queue *q,
 		    struct bio *bio) { return; }
 	],[
 		blk_queue_make_request(NULL, &make_request);
 	])
 
 	ZFS_LINUX_TEST_SRC([make_request_fn_blk_qc_t], [
 		#include <linux/blkdev.h>
 		static blk_qc_t make_request(struct request_queue *q,
 		    struct bio *bio) { return (BLK_QC_T_NONE); }
 	],[
 		blk_queue_make_request(NULL, &make_request);
 	])
 
 	ZFS_LINUX_TEST_SRC([blk_alloc_queue_request_fn], [
 		#include <linux/blkdev.h>
 		static blk_qc_t make_request(struct request_queue *q,
 		    struct bio *bio) { return (BLK_QC_T_NONE); }
 	],[
 		struct request_queue *q __attribute__ ((unused));
 		q = blk_alloc_queue(make_request, NUMA_NO_NODE);
 	])
 
 	ZFS_LINUX_TEST_SRC([blk_alloc_queue_request_fn_rh], [
 		#include <linux/blkdev.h>
 		static blk_qc_t make_request(struct request_queue *q,
 		    struct bio *bio) { return (BLK_QC_T_NONE); }
 	],[
 		struct request_queue *q __attribute__ ((unused));
 		q = blk_alloc_queue_rh(make_request, NUMA_NO_NODE);
 	])
 
 	ZFS_LINUX_TEST_SRC([block_device_operations_submit_bio], [
 		#include <linux/blkdev.h>
 	],[
 		struct block_device_operations o;
 		o.submit_bio = NULL;
 	])
 
 	ZFS_LINUX_TEST_SRC([blk_alloc_disk], [
 		#include <linux/blkdev.h>
 	],[
 		struct gendisk *disk  __attribute__ ((unused));
 		disk = blk_alloc_disk(NUMA_NO_NODE);
 	])
 
 	ZFS_LINUX_TEST_SRC([blk_alloc_disk_2arg], [
 		#include <linux/blkdev.h>
 	],[
 		struct queue_limits *lim = NULL;
 		struct gendisk *disk  __attribute__ ((unused));
 		disk = blk_alloc_disk(lim, NUMA_NO_NODE);
 	])
 
+	ZFS_LINUX_TEST_SRC([blkdev_queue_limits_features], [
+		#include <linux/blkdev.h>
+	],[
+		struct queue_limits *lim = NULL;
+		lim->features = 0;
+	])
+
 	ZFS_LINUX_TEST_SRC([blk_cleanup_disk], [
 		#include <linux/blkdev.h>
 	],[
 		struct gendisk *disk  __attribute__ ((unused));
 		blk_cleanup_disk(disk);
 	])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
 	dnl # Checked as part of the blk_alloc_queue_request_fn test
 	dnl #
 	dnl # Linux 5.9 API Change
 	dnl # make_request_fn was moved into block_device_operations->submit_bio
 	dnl #
 	AC_MSG_CHECKING([whether submit_bio is member of struct block_device_operations])
 	ZFS_LINUX_TEST_RESULT([block_device_operations_submit_bio], [
 		AC_MSG_RESULT(yes)
 
 		AC_DEFINE(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS, 1,
 		    [submit_bio is member of struct block_device_operations])
 
 		dnl #
 		dnl # Linux 5.14 API Change:
 		dnl # blk_alloc_queue() + alloc_disk() combo replaced by
 		dnl # a single call to blk_alloc_disk().
 		dnl #
 		AC_MSG_CHECKING([whether blk_alloc_disk() exists])
 		ZFS_LINUX_TEST_RESULT([blk_alloc_disk], [
 			AC_MSG_RESULT(yes)
 			AC_DEFINE([HAVE_BLK_ALLOC_DISK], 1, [blk_alloc_disk() exists])
 
 			dnl #
 			dnl # 5.20 API change,
 			dnl # Removed blk_cleanup_disk(), put_disk() should be used.
 			dnl #
 			AC_MSG_CHECKING([whether blk_cleanup_disk() exists])
 			ZFS_LINUX_TEST_RESULT([blk_cleanup_disk], [
 				AC_MSG_RESULT(yes)
 				AC_DEFINE([HAVE_BLK_CLEANUP_DISK], 1,
 				    [blk_cleanup_disk() exists])
 			], [
 				AC_MSG_RESULT(no)
 			])
 		], [
 			AC_MSG_RESULT(no)
 		])
 
 		dnl #
 		dnl # Linux 6.9 API Change:
 		dnl # blk_alloc_queue() takes a nullable queue_limits arg.
 		dnl #
 		AC_MSG_CHECKING([whether blk_alloc_disk() exists and takes 2 args])
 		ZFS_LINUX_TEST_RESULT([blk_alloc_disk_2arg], [
 			AC_MSG_RESULT(yes)
 			AC_DEFINE([HAVE_BLK_ALLOC_DISK_2ARG], 1, [blk_alloc_disk() exists and takes 2 args])
 
+			dnl #
+			dnl # Linux 6.11 API change:
+			dnl # struct queue_limits gains a 'features' field,
+			dnl # used to set flushing options
+			dnl #
+			AC_MSG_CHECKING([whether struct queue_limits has a features field])
+			ZFS_LINUX_TEST_RESULT([blkdev_queue_limits_features], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE([HAVE_BLKDEV_QUEUE_LIMITS_FEATURES], 1,
+				    [struct queue_limits has a features field])
+			], [
+				AC_MSG_RESULT(no)
+			])
+
 			dnl #
 			dnl # 5.20 API change,
 			dnl # Removed blk_cleanup_disk(), put_disk() should be used.
 			dnl #
 			AC_MSG_CHECKING([whether blk_cleanup_disk() exists])
 			ZFS_LINUX_TEST_RESULT([blk_cleanup_disk], [
 				AC_MSG_RESULT(yes)
 				AC_DEFINE([HAVE_BLK_CLEANUP_DISK], 1,
 				    [blk_cleanup_disk() exists])
 			], [
 				AC_MSG_RESULT(no)
 			])
 		], [
 			AC_MSG_RESULT(no)
 		])
 	],[
 		AC_MSG_RESULT(no)
 
 		dnl # Checked as part of the blk_alloc_queue_request_fn test
 		dnl #
 		dnl # Linux 5.7 API Change
 		dnl # blk_alloc_queue() expects request function.
 		dnl #
 		AC_MSG_CHECKING([whether blk_alloc_queue() expects request function])
 		ZFS_LINUX_TEST_RESULT([blk_alloc_queue_request_fn], [
 			AC_MSG_RESULT(yes)
 
 			dnl # This is currently always the case.
 			AC_MSG_CHECKING([whether make_request_fn() returns blk_qc_t])
 			AC_MSG_RESULT(yes)
 
 			AC_DEFINE(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN, 1,
 			    [blk_alloc_queue() expects request function])
 			AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t,
 			    [make_request_fn() return type])
 			AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1,
 			    [Noting that make_request_fn() returns blk_qc_t])
 		],[
 			dnl #
 			dnl # CentOS Stream 4.18.0-257 API Change
 			dnl # The Linux 5.7 blk_alloc_queue() change was back-
 			dnl # ported and the symbol renamed blk_alloc_queue_rh().
 			dnl # As of this kernel version they're not providing
 			dnl # any compatibility code in the kernel for this.
 			dnl #
 			ZFS_LINUX_TEST_RESULT([blk_alloc_queue_request_fn_rh], [
 				AC_MSG_RESULT(yes)
 
 				dnl # This is currently always the case.
 				AC_MSG_CHECKING([whether make_request_fn_rh() returns blk_qc_t])
 				AC_MSG_RESULT(yes)
 
 				AC_DEFINE(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH, 1,
 				    [blk_alloc_queue_rh() expects request function])
 				AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t,
 				    [make_request_fn() return type])
 				AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1,
 				    [Noting that make_request_fn() returns blk_qc_t])
 			],[
 				AC_MSG_RESULT(no)
 
 				dnl #
 				dnl # Linux 3.2 API Change
 				dnl # make_request_fn returns void.
 				dnl #
 				AC_MSG_CHECKING(
 				    [whether make_request_fn() returns void])
 				ZFS_LINUX_TEST_RESULT([make_request_fn_void], [
 					AC_MSG_RESULT(yes)
 					AC_DEFINE(MAKE_REQUEST_FN_RET, void,
 					    [make_request_fn() return type])
 					AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_VOID, 1,
 					    [Noting that make_request_fn() returns void])
 				],[
 					AC_MSG_RESULT(no)
 
 					dnl #
 					dnl # Linux 4.4 API Change
 					dnl # make_request_fn returns blk_qc_t.
 					dnl #
 					AC_MSG_CHECKING(
 					    [whether make_request_fn() returns blk_qc_t])
 					ZFS_LINUX_TEST_RESULT([make_request_fn_blk_qc_t], [
 						AC_MSG_RESULT(yes)
 						AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t,
 						    [make_request_fn() return type])
 						AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1,
 						    [Noting that make_request_fn() ]
 						    [returns blk_qc_t])
 					],[
 						ZFS_LINUX_TEST_ERROR([make_request_fn])
 					])
 				])
 			])
 		])
 	])
 ])
diff --git a/sys/contrib/openzfs/config/kernel-mm-page-size.m4 b/sys/contrib/openzfs/config/kernel-mm-page-size.m4
deleted file mode 100644
index d5ebd926986a..000000000000
--- a/sys/contrib/openzfs/config/kernel-mm-page-size.m4
+++ /dev/null
@@ -1,17 +0,0 @@
-AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
-	ZFS_LINUX_TEST_SRC([page_size], [
-		#include <linux/mm.h>
-	],[
-		unsigned long s;
-		s = page_size(NULL);
-	])
-])
-AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
-	AC_MSG_CHECKING([whether page_size() is available])
-	ZFS_LINUX_TEST_RESULT([page_size], [
-		AC_MSG_RESULT(yes)
-		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
-	],[
-		AC_MSG_RESULT(no)
-	])
-])
diff --git a/sys/contrib/openzfs/config/kernel-mm-pagemap.m4 b/sys/contrib/openzfs/config/kernel-mm-pagemap.m4
new file mode 100644
index 000000000000..466b6fa07d9a
--- /dev/null
+++ b/sys/contrib/openzfs/config/kernel-mm-pagemap.m4
@@ -0,0 +1,36 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE], [
+	ZFS_LINUX_TEST_SRC([page_size], [
+		#include <linux/mm.h>
+	],[
+		unsigned long s;
+		s = page_size(NULL);
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_SIZE], [
+	AC_MSG_CHECKING([whether page_size() is available])
+	ZFS_LINUX_TEST_RESULT([page_size], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_SIZE, 1, [page_size() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING], [
+	ZFS_LINUX_TEST_SRC([page_mapping], [
+		#include <linux/pagemap.h>
+	],[
+		struct page *p = NULL;	/* argument for the probe call */
+		(void) page_mapping(p);	/* value is deliberately discarded */
+	])
+])
+AC_DEFUN([ZFS_AC_KERNEL_MM_PAGE_MAPPING], [
+	AC_MSG_CHECKING([whether page_mapping() is available])
+	ZFS_LINUX_TEST_RESULT([page_mapping], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MM_PAGE_MAPPING, 1, [page_mapping() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/sys/contrib/openzfs/config/kernel-register_sysctl_table.m4 b/sys/contrib/openzfs/config/kernel-register_sysctl_table.m4
index a5e934f56d29..12ffe9d95142 100644
--- a/sys/contrib/openzfs/config/kernel-register_sysctl_table.m4
+++ b/sys/contrib/openzfs/config/kernel-register_sysctl_table.m4
@@ -1,27 +1,86 @@
 dnl #
 dnl # Linux 6.5 removes register_sysctl_table
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [
 	ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [
 		#include <linux/sysctl.h>
 
 		static struct ctl_table dummy_table[] = {
 			{}
 		};
 
     ],[
 		struct ctl_table_header *h
 			__attribute((unused)) = register_sysctl_table(dummy_table);
     ])
 ])
 
 AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [
 	AC_MSG_CHECKING([whether register_sysctl_table exists])
 	ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [
 		AC_MSG_RESULT([yes])
 		AC_DEFINE(HAVE_REGISTER_SYSCTL_TABLE, 1,
 			[register_sysctl_table exists])
 	],[
 		AC_MSG_RESULT([no])
 	])
 ])
+
+dnl #
+dnl # Linux 6.11 register_sysctl() enforces that sysctl tables no longer
+dnl # supply a sentinel end-of-table element. 6.6 introduces
+dnl # register_sysctl_sz() to enable callers to choose, so we use it if
+dnl # available for backward compatibility.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ], [
+	ZFS_LINUX_TEST_SRC([has_register_sysctl_sz], [
+		#include <linux/sysctl.h>
+	],[
+		struct ctl_table test_table[] __attribute__((unused)) = {0};
+		register_sysctl_sz("", test_table, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ], [
+	AC_MSG_CHECKING([whether register_sysctl_sz exists])
+	ZFS_LINUX_TEST_RESULT([has_register_sysctl_sz], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_REGISTER_SYSCTL_SZ, 1,
+			[register_sysctl_sz exists])
+	],[
+		AC_MSG_RESULT([no])
+	])
+])
+
+dnl #
+dnl # Linux 6.11 makes const the ctl_table arg of proc_handler
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST], [
+	ZFS_LINUX_TEST_SRC([has_proc_handler_ctl_table_const], [
+		#include <linux/sysctl.h>
+
+		static int test_handler(
+		    const struct ctl_table *ctl __attribute((unused)),
+		    int write __attribute((unused)),
+		    void *buffer __attribute((unused)),
+		    size_t *lenp __attribute((unused)),
+		    loff_t *ppos __attribute((unused)))
+		{
+			return (0);
+		}
+	], [
+		proc_handler *ph __attribute((unused)) =
+		    &test_handler;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST], [
+	AC_MSG_CHECKING([whether proc_handler ctl_table arg is const])
+	ZFS_LINUX_TEST_RESULT([has_proc_handler_ctl_table_const], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_PROC_HANDLER_CTL_TABLE_CONST, 1,
+		    [proc_handler ctl_table arg is const])
+	], [
+		AC_MSG_RESULT([no])
+	])
+])
diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4
index b51477b6a951..f0cd76fd7325 100644
--- a/sys/contrib/openzfs/config/kernel.m4
+++ b/sys/contrib/openzfs/config/kernel.m4
@@ -1,1048 +1,1054 @@
 dnl #
 dnl # Default ZFS kernel configuration
 dnl #
 dnl # Only active when BUILD_LINUX is set.  Locates the kernel and QAT
 dnl # trees, runs the sequential and then the parallel kernel interface
 dnl # tests, and substitutes KERNEL_MAKE (with O=$LINUX_OBJ appended when
 dnl # the object directory differs from the source directory).
 dnl #
 AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
 	AM_COND_IF([BUILD_LINUX], [
 		dnl # Setup the kernel build environment.
 		ZFS_AC_KERNEL
 		ZFS_AC_QAT
 
 		dnl # Sanity checks for module building and CONFIG_* defines
 		ZFS_AC_KERNEL_CONFIG_DEFINED
 		ZFS_AC_MODULE_SYMVERS
 
 		dnl # Sequential ZFS_LINUX_TRY_COMPILE tests
 		ZFS_AC_KERNEL_FPU_HEADER
 		ZFS_AC_KERNEL_OBJTOOL_HEADER
 		ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T
 		ZFS_AC_KERNEL_MISC_MINOR
 		ZFS_AC_KERNEL_DECLARE_EVENT_CLASS
 
 		dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests
 		ZFS_AC_KERNEL_TEST_SRC
 		ZFS_AC_KERNEL_TEST_RESULT
 
 		AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
 			KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ"
 		])
 
 		AC_SUBST(KERNEL_MAKE)
 	])
 ])
 
 dnl #
 dnl # Generate and compile all of the kernel API test cases to determine
 dnl # which interfaces are available.  By invoking the kernel build system
 dnl # only once the compilation can be done in parallel significantly
 dnl # speeding up the process.
 dnl #
 dnl # The ZFS_AC_KERNEL_SRC_* macros are invoked first and the single
 dnl # ZFS_LINUX_TEST_COMPILE_ALL([kabi]) call at the end builds whatever
 dnl # they generated.  The powerpc and riscv specific sources are only
 dnl # generated when $host_cpu matches.
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [
 	ZFS_AC_KERNEL_SRC_TYPES
 	ZFS_AC_KERNEL_SRC_OBJTOOL
 	ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE
 	ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE
 	ZFS_AC_KERNEL_SRC_PDE_DATA
 	ZFS_AC_KERNEL_SRC_FALLOCATE
 	ZFS_AC_KERNEL_SRC_FADVISE
 	ZFS_AC_KERNEL_SRC_GENERIC_FADVISE
 	ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
 	ZFS_AC_KERNEL_SRC_RWSEM
 	ZFS_AC_KERNEL_SRC_SCHED
 	ZFS_AC_KERNEL_SRC_USLEEP_RANGE
 	ZFS_AC_KERNEL_SRC_KMEM_CACHE
 	ZFS_AC_KERNEL_SRC_KVMALLOC
 	ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL
 	ZFS_AC_KERNEL_SRC_WAIT
 	ZFS_AC_KERNEL_SRC_INODE_TIMES
 	ZFS_AC_KERNEL_SRC_INODE_LOCK
 	ZFS_AC_KERNEL_SRC_GROUP_INFO_GID
 	ZFS_AC_KERNEL_SRC_RW
 	ZFS_AC_KERNEL_SRC_TIMER_SETUP
 	ZFS_AC_KERNEL_SRC_SUPER_USER_NS
 	ZFS_AC_KERNEL_SRC_PROC_OPERATIONS
 	ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS
 	ZFS_AC_KERNEL_SRC_BIO
 	ZFS_AC_KERNEL_SRC_BLKDEV
 	ZFS_AC_KERNEL_SRC_BLK_QUEUE
 	ZFS_AC_KERNEL_SRC_GENHD_FLAGS
 	ZFS_AC_KERNEL_SRC_REVALIDATE_DISK
 	ZFS_AC_KERNEL_SRC_GET_DISK_RO
 	ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL
 	ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY
 	ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE
 	ZFS_AC_KERNEL_SRC_XATTR
 	ZFS_AC_KERNEL_SRC_ACL
 	ZFS_AC_KERNEL_SRC_INODE_SETATTR
 	ZFS_AC_KERNEL_SRC_INODE_GETATTR
 	ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS
 	ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION
 	ZFS_AC_KERNEL_SRC_SHOW_OPTIONS
 	ZFS_AC_KERNEL_SRC_FILE_INODE
 	ZFS_AC_KERNEL_SRC_FILE_DENTRY
 	ZFS_AC_KERNEL_SRC_FSYNC
 	ZFS_AC_KERNEL_SRC_AIO_FSYNC
 	ZFS_AC_KERNEL_SRC_EVICT_INODE
 	ZFS_AC_KERNEL_SRC_DIRTY_INODE
 	ZFS_AC_KERNEL_SRC_SHRINKER
 	ZFS_AC_KERNEL_SRC_MKDIR
 	ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS
 	ZFS_AC_KERNEL_SRC_CREATE
 	ZFS_AC_KERNEL_SRC_PERMISSION
 	ZFS_AC_KERNEL_SRC_GET_LINK
 	ZFS_AC_KERNEL_SRC_PUT_LINK
 	ZFS_AC_KERNEL_SRC_TMPFILE
 	ZFS_AC_KERNEL_SRC_AUTOMOUNT
 	ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE
 	ZFS_AC_KERNEL_SRC_COMMIT_METADATA
 	ZFS_AC_KERNEL_SRC_CLEAR_INODE
 	ZFS_AC_KERNEL_SRC_SETATTR_PREPARE
 	ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED
 	ZFS_AC_KERNEL_SRC_DENTRY
 	ZFS_AC_KERNEL_SRC_DENTRY_ALIAS_D_U
 	ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE
 	ZFS_AC_KERNEL_SRC_SECURITY_INODE
 	ZFS_AC_KERNEL_SRC_FST_MOUNT
 	ZFS_AC_KERNEL_SRC_BDI
 	ZFS_AC_KERNEL_SRC_SET_NLINK
 	ZFS_AC_KERNEL_SRC_SGET
 	ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE
 	ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO
 	ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO
 	ZFS_AC_KERNEL_SRC_VFS_GETATTR
 	ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS
 	ZFS_AC_KERNEL_SRC_VFS_ITERATE
 	ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO
 	ZFS_AC_KERNEL_SRC_VFS_READPAGES
 	ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS
 	ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE
 	ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS
 	ZFS_AC_KERNEL_SRC_VFS_IOV_ITER
 	ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE
 	ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND
 	ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS
 	ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE
 	ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN
 	ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT
 	ZFS_AC_KERNEL_SRC_FPU
 	ZFS_AC_KERNEL_SRC_FMODE_T
 	ZFS_AC_KERNEL_SRC_KUIDGID_T
 	ZFS_AC_KERNEL_SRC_KUID_HELPERS
 	ZFS_AC_KERNEL_SRC_RENAME
 	ZFS_AC_KERNEL_SRC_CURRENT_TIME
 	ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES
 	ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL
 	ZFS_AC_KERNEL_SRC_KTIME
 	ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC
 	ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES
 	ZFS_AC_KERNEL_SRC_KSTRTOUL
 	ZFS_AC_KERNEL_SRC_PERCPU
 	ZFS_AC_KERNEL_SRC_CPU_HOTPLUG
 	ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR
 	ZFS_AC_KERNEL_SRC_MKNOD
 	ZFS_AC_KERNEL_SRC_SYMLINK
 	ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS
 	ZFS_AC_KERNEL_SRC_SIGNAL_STOP
 	ZFS_AC_KERNEL_SRC_SIGINFO
 	ZFS_AC_KERNEL_SRC_SYSFS
 	ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE
 	ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG
 	ZFS_AC_KERNEL_SRC_STRLCPY
 	ZFS_AC_KERNEL_SRC_STRSCPY
 	ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT
 	ZFS_AC_KERNEL_SRC_ADD_DISK
 	ZFS_AC_KERNEL_SRC_KTHREAD
 	ZFS_AC_KERNEL_SRC_ZERO_PAGE
 	ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC
 	ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM
 	ZFS_AC_KERNEL_SRC_IDMAP_MNT_API
 	ZFS_AC_KERNEL_SRC_IDMAP_NO_USERNS
 	ZFS_AC_KERNEL_SRC_IATTR_VFSID
 	ZFS_AC_KERNEL_SRC_FILEMAP
 	ZFS_AC_KERNEL_SRC_WRITEPAGE_T
 	ZFS_AC_KERNEL_SRC_RECLAIMED
 	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST
 	ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SRC_SYNC_BDEV
 	ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE
 			ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE
 			;;
 		riscv*)
 			ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE
 			;;
 	esac
 
 	AC_MSG_CHECKING([for available kernel interfaces])
 	ZFS_LINUX_TEST_COMPILE_ALL([kabi])
 	AC_MSG_RESULT([done])
 ])
 
 dnl #
 dnl # Check results of kernel interface tests.
 dnl #
 dnl # Runs after ZFS_AC_KERNEL_TEST_SRC.  ZFS_LINUX_TEST_RESULT aborts
 dnl # configure when the build directory for a test name does not exist,
 dnl # so every result macro listed here needs its test source to have
 dnl # been generated there (including the $host_cpu specific cases).
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [
 	ZFS_AC_KERNEL_TYPES
 	ZFS_AC_KERNEL_ACCESS_OK_TYPE
 	ZFS_AC_KERNEL_GLOBAL_PAGE_STATE
 	ZFS_AC_KERNEL_OBJTOOL
 	ZFS_AC_KERNEL_PDE_DATA
 	ZFS_AC_KERNEL_FALLOCATE
 	ZFS_AC_KERNEL_FADVISE
 	ZFS_AC_KERNEL_GENERIC_FADVISE
 	ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE
 	ZFS_AC_KERNEL_RWSEM
 	ZFS_AC_KERNEL_SCHED
 	ZFS_AC_KERNEL_USLEEP_RANGE
 	ZFS_AC_KERNEL_KMEM_CACHE
 	ZFS_AC_KERNEL_KVMALLOC
 	ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL
 	ZFS_AC_KERNEL_WAIT
 	ZFS_AC_KERNEL_INODE_TIMES
 	ZFS_AC_KERNEL_INODE_LOCK
 	ZFS_AC_KERNEL_GROUP_INFO_GID
 	ZFS_AC_KERNEL_RW
 	ZFS_AC_KERNEL_TIMER_SETUP
 	ZFS_AC_KERNEL_SUPER_USER_NS
 	ZFS_AC_KERNEL_PROC_OPERATIONS
 	ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS
 	ZFS_AC_KERNEL_BIO
 	ZFS_AC_KERNEL_BLKDEV
 	ZFS_AC_KERNEL_BLK_QUEUE
 	ZFS_AC_KERNEL_GENHD_FLAGS
 	ZFS_AC_KERNEL_REVALIDATE_DISK
 	ZFS_AC_KERNEL_GET_DISK_RO
 	ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL
 	ZFS_AC_KERNEL_DISCARD_GRANULARITY
 	ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE
 	ZFS_AC_KERNEL_XATTR
 	ZFS_AC_KERNEL_ACL
 	ZFS_AC_KERNEL_INODE_SETATTR
 	ZFS_AC_KERNEL_INODE_GETATTR
 	ZFS_AC_KERNEL_INODE_SET_FLAGS
 	ZFS_AC_KERNEL_INODE_SET_IVERSION
 	ZFS_AC_KERNEL_SHOW_OPTIONS
 	ZFS_AC_KERNEL_FILE_INODE
 	ZFS_AC_KERNEL_FILE_DENTRY
 	ZFS_AC_KERNEL_FSYNC
 	ZFS_AC_KERNEL_AIO_FSYNC
 	ZFS_AC_KERNEL_EVICT_INODE
 	ZFS_AC_KERNEL_DIRTY_INODE
 	ZFS_AC_KERNEL_SHRINKER
 	ZFS_AC_KERNEL_MKDIR
 	ZFS_AC_KERNEL_LOOKUP_FLAGS
 	ZFS_AC_KERNEL_CREATE
 	ZFS_AC_KERNEL_PERMISSION
 	ZFS_AC_KERNEL_GET_LINK
 	ZFS_AC_KERNEL_PUT_LINK
 	ZFS_AC_KERNEL_TMPFILE
 	ZFS_AC_KERNEL_AUTOMOUNT
 	ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE
 	ZFS_AC_KERNEL_COMMIT_METADATA
 	ZFS_AC_KERNEL_CLEAR_INODE
 	ZFS_AC_KERNEL_SETATTR_PREPARE
 	ZFS_AC_KERNEL_INSERT_INODE_LOCKED
 	ZFS_AC_KERNEL_DENTRY
 	ZFS_AC_KERNEL_DENTRY_ALIAS_D_U
 	ZFS_AC_KERNEL_TRUNCATE_SETSIZE
 	ZFS_AC_KERNEL_SECURITY_INODE
 	ZFS_AC_KERNEL_FST_MOUNT
 	ZFS_AC_KERNEL_BDI
 	ZFS_AC_KERNEL_SET_NLINK
 	ZFS_AC_KERNEL_SGET
 	ZFS_AC_KERNEL_LSEEK_EXECUTE
 	ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO
 	ZFS_AC_KERNEL_VFS_READ_FOLIO
 	ZFS_AC_KERNEL_VFS_GETATTR
 	ZFS_AC_KERNEL_VFS_FSYNC_2ARGS
 	ZFS_AC_KERNEL_VFS_ITERATE
 	ZFS_AC_KERNEL_VFS_DIRECT_IO
 	ZFS_AC_KERNEL_VFS_READPAGES
 	ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS
 	ZFS_AC_KERNEL_VFS_RW_ITERATE
 	ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS
 	ZFS_AC_KERNEL_VFS_IOV_ITER
 	ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE
 	ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND
 	ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS
 	ZFS_AC_KERNEL_FOLLOW_DOWN_ONE
 	ZFS_AC_KERNEL_MAKE_REQUEST_FN
 	ZFS_AC_KERNEL_GENERIC_IO_ACCT
 	ZFS_AC_KERNEL_FPU
 	ZFS_AC_KERNEL_FMODE_T
 	ZFS_AC_KERNEL_KUIDGID_T
 	ZFS_AC_KERNEL_KUID_HELPERS
 	ZFS_AC_KERNEL_RENAME
 	ZFS_AC_KERNEL_CURRENT_TIME
 	ZFS_AC_KERNEL_USERNS_CAPABILITIES
 	ZFS_AC_KERNEL_IN_COMPAT_SYSCALL
 	ZFS_AC_KERNEL_KTIME
 	ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC
 	ZFS_AC_KERNEL_TOTALHIGH_PAGES
 	ZFS_AC_KERNEL_KSTRTOUL
 	ZFS_AC_KERNEL_PERCPU
 	ZFS_AC_KERNEL_CPU_HOTPLUG
 	ZFS_AC_KERNEL_GENERIC_FILLATTR
 	ZFS_AC_KERNEL_MKNOD
 	ZFS_AC_KERNEL_SYMLINK
 	ZFS_AC_KERNEL_BIO_MAX_SEGS
 	ZFS_AC_KERNEL_SIGNAL_STOP
 	ZFS_AC_KERNEL_SIGINFO
 	ZFS_AC_KERNEL_SYSFS
 	ZFS_AC_KERNEL_SET_SPECIAL_STATE
 	ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG
 	ZFS_AC_KERNEL_STRLCPY
 	ZFS_AC_KERNEL_STRSCPY
 	ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT
 	ZFS_AC_KERNEL_ADD_DISK
 	ZFS_AC_KERNEL_KTHREAD
 	ZFS_AC_KERNEL_ZERO_PAGE
 	ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC
 	ZFS_AC_KERNEL_USER_NS_COMMON_INUM
 	ZFS_AC_KERNEL_IDMAP_MNT_API
 	ZFS_AC_KERNEL_IDMAP_NO_USERNS
 	ZFS_AC_KERNEL_IATTR_VFSID
 	ZFS_AC_KERNEL_FILEMAP
 	ZFS_AC_KERNEL_WRITEPAGE_T
 	ZFS_AC_KERNEL_RECLAIMED
 	ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE
+	ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ
+	ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST
 	ZFS_AC_KERNEL_COPY_SPLICE_READ
 	ZFS_AC_KERNEL_SYNC_BDEV
 	ZFS_AC_KERNEL_MM_PAGE_SIZE
+	ZFS_AC_KERNEL_MM_PAGE_MAPPING
 	case "$host_cpu" in
 		powerpc*)
 			ZFS_AC_KERNEL_CPU_HAS_FEATURE
 			ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE
 			;;
 		riscv*)
 			ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE
 			;;
 	esac
 ])
 
 dnl #
 dnl # Detect name used for Module.symvers file in kernel
 dnl #
 dnl # Unless --enable-linux-builtin was given, and provided that
 dnl # $LINUX/scripts/Makefile.modpost exists, LINUX_SYMBOLS is set to
 dnl # Modules.symvers if Makefile.modpost mentions that name and to
 dnl # Module.symvers otherwise; configure aborts if that file is missing
 dnl # from $LINUX_OBJ.  In every other case LINUX_SYMBOLS is set to NONE.
 dnl #
 AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [
 	modpost=$LINUX/scripts/Makefile.modpost
 	AC_MSG_CHECKING([kernel file name for module symbols])
 	AS_IF([test "x$enable_linux_builtin" != xyes -a -f "$modpost"], [
 		AS_IF([grep -q Modules.symvers $modpost], [
 			LINUX_SYMBOLS=Modules.symvers
 		], [
 			LINUX_SYMBOLS=Module.symvers
 		])
 
 		AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [
 			AC_MSG_ERROR([
 	*** Please make sure the kernel devel package for your distribution
 	*** is installed.  If you are building with a custom kernel, make sure
 	*** the kernel is configured, built, and the '--with-linux=PATH'
 	*** configure option refers to the location of the kernel source.
 			])
 		])
 	], [
 		LINUX_SYMBOLS=NONE
 	])
 	AC_MSG_RESULT($LINUX_SYMBOLS)
 	AC_SUBST(LINUX_SYMBOLS)
 ])
 
 dnl #
 dnl # Detect the kernel to be built against
 dnl #
 dnl # Most modern Linux distributions have separate locations for bare
 dnl # source (source) and prebuilt (build) files. Additionally, there are
 dnl # `source` and `build` symlinks in `/lib/modules/$(KERNEL_VERSION)`
 dnl # pointing to them. The directory search order is now:
 dnl # 
 dnl # - `configure` command line values if both `--with-linux` and
 dnl #   `--with-linux-obj` were defined
 dnl # 
 dnl # - If only `--with-linux` was defined, `--with-linux-obj` is assumed
 dnl #   to have the same value as `--with-linux`
 dnl # 
 dnl # - If neither `--with-linux` nor `--with-linux-obj` were defined
 dnl #   autodetection is used:
 dnl # 
 dnl #   - `/lib/modules/$(uname -r)/{source,build}` respectively, if exist.
 dnl # 
 dnl #   - If only `/lib/modules/$(uname -r)/build` exists, it is assumed
 dnl #     to be both source and build directory.
 dnl # 
 dnl #   - The first directory in `/lib/modules` with the highest version
 dnl #     number according to `sort -V` which contains both `source` and
 dnl #     `build` symlinks/directories. If module directory contains only
 dnl #     `build` component, it is assumed to be both source and build
 dnl #     directory.
 dnl # 
 dnl #   - Last resort: the first directory matching `/usr/src/kernels/*`
 dnl #     and `/usr/src/linux-*` with the highest version number according
 dnl #     to `sort -V` is assumed to be both source and build directory.
 dnl #
 dnl # On success LINUX, LINUX_OBJ and LINUX_VERSION are substituted with
 dnl # the source directory, the build directory and the UTS_RELEASE
 dnl # string read from the build directory.  Configure aborts if either
 dnl # directory is missing, if UTS_RELEASE cannot be determined, or if
 dnl # the version is older than $ZFS_META_KVER_MIN.
 dnl #
 AC_DEFUN([ZFS_AC_KERNEL], [
 	AC_ARG_WITH([linux],
 		AS_HELP_STRING([--with-linux=PATH],
 		[Path to kernel source]),
 		[kernelsrc="$withval"])
 
 	AC_ARG_WITH(linux-obj,
 		AS_HELP_STRING([--with-linux-obj=PATH],
 		[Path to kernel build objects]),
 		[kernelbuild="$withval"])
 
 	AC_MSG_CHECKING([kernel source and build directories])
 	AS_IF([test -n "$kernelsrc" && test -z "$kernelbuild"], [
 		kernelbuild="$kernelsrc"
 	], [test -z "$kernelsrc"], [
 		AS_IF([test -e "/lib/modules/$(uname -r)/source" && \
 		       test -e "/lib/modules/$(uname -r)/build"], [
 			src="/lib/modules/$(uname -r)/source"
 			build="/lib/modules/$(uname -r)/build"
 		], [test -e "/lib/modules/$(uname -r)/build"], [
 			build="/lib/modules/$(uname -r)/build"
 			src="$build"
 		], [
 			src=
 
 			for d in $(ls -1d /lib/modules/* 2>/dev/null | sort -Vr); do
 				if test -e "$d/source" && test -e "$d/build"; then
 					src="$d/source"
 					build="$d/build"
 					break
 				fi
 
 				if test -e "$d/build"; then
 					src="$d/build"
 					build="$d/build"
 					break
 				fi
 			done
 
 			# the least reliable method
 			if test -z "$src"; then
 				src=$(ls -1d /usr/src/kernels/* /usr/src/linux-* \
 				      2>/dev/null | grep -v obj | sort -Vr | head -1)
 				build="$src"
 			fi
 		])
 
 		AS_IF([test -n "$src" && test -e "$src"], [
 			kernelsrc=$(readlink -e "$src")
 		], [
 			kernelsrc="[Not found]"
 		])
 		AS_IF([test -n "$build" && test -e "$build"], [
 			kernelbuild=$(readlink -e "$build")
 		], [
 			kernelbuild="[Not found]"
 		])
 	], [
 		AS_IF([test "$kernelsrc" = "NONE"], [
 			kernsrcver=NONE
 		])
 		withlinux=yes
 	])
 
 	AC_MSG_RESULT([done])
 	AC_MSG_CHECKING([kernel source directory])
 	AC_MSG_RESULT([$kernelsrc])
 	AC_MSG_CHECKING([kernel build directory])
 	AC_MSG_RESULT([$kernelbuild])
 	AS_IF([test ! -d "$kernelsrc" || test ! -d "$kernelbuild"], [
 		AC_MSG_ERROR([
 	*** Please make sure the kernel devel package for your distribution
 	*** is installed and then try again.  If that fails, you can specify the
 	*** location of the kernel source and build with the '--with-linux=PATH' and
 	*** '--with-linux-obj=PATH' options respectively.])
 	])
 
 	AC_MSG_CHECKING([kernel source version])
 	utsrelease1=$kernelbuild/include/linux/version.h
 	utsrelease2=$kernelbuild/include/linux/utsrelease.h
 	utsrelease3=$kernelbuild/include/generated/utsrelease.h
 	AS_IF([test -r $utsrelease1 && grep -qF UTS_RELEASE $utsrelease1], [
 		utsrelease=$utsrelease1
 	], [test -r $utsrelease2 && grep -qF UTS_RELEASE $utsrelease2], [
 		utsrelease=$utsrelease2
 	], [test -r $utsrelease3 && grep -qF UTS_RELEASE $utsrelease3], [
 		utsrelease=$utsrelease3
 	])
 
 	AS_IF([test -n "$utsrelease"], [
 		kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease)
 		AS_IF([test -z "$kernsrcver"], [
 			AC_MSG_RESULT([Not found])
 			AC_MSG_ERROR([
 	*** Cannot determine kernel version.
 			])
 		])
 	], [
 		AC_MSG_RESULT([Not found])
 		if test "x$enable_linux_builtin" != xyes; then
 			AC_MSG_ERROR([
 	*** Cannot find UTS_RELEASE definition.
 			])
 		else
 			AC_MSG_ERROR([
 	*** Cannot find UTS_RELEASE definition.
 	*** Please run 'make prepare' inside the kernel source tree.])
 		fi
 	])
 
 	AC_MSG_RESULT([$kernsrcver])
 
 	AS_VERSION_COMPARE([$kernsrcver], [$ZFS_META_KVER_MIN], [
 		 AC_MSG_ERROR([
 	*** Cannot build against kernel version $kernsrcver.
 	*** The minimum supported kernel version is $ZFS_META_KVER_MIN.
 		])
 	])
 
 	LINUX=${kernelsrc}
 	LINUX_OBJ=${kernelbuild}
 	LINUX_VERSION=${kernsrcver}
 
 	AC_SUBST(LINUX)
 	AC_SUBST(LINUX_OBJ)
 	AC_SUBST(LINUX_VERSION)
 ])
 
 dnl #
 dnl # Detect the QAT module to be built against, QAT provides hardware
 dnl # acceleration for data compression:
 dnl #
 dnl # https://01.org/intel-quickassist-technology
 dnl #
 dnl # 1) Download and install QAT driver from the above link
 dnl # 2) Start QAT driver in your system:
 dnl # 	 service qat_service start
 dnl # 3) Enable QAT in ZFS, e.g.:
 dnl # 	 ./configure --with-qat=<qat-driver-path>/QAT1.6
 dnl # 	 make
 dnl # 4) Set GZIP compression in ZFS dataset:
 dnl # 	 zfs set compression=gzip <dataset>
 dnl #
 dnl # Then the data written to this ZFS pool is compressed by QAT accelerator
 dnl # automatically, and de-compressed by QAT when read from the pool.
 dnl #
 dnl # 1) Get QAT hardware statistics with:
 dnl #	 cat /proc/icp_dh895xcc_dev/qat
 dnl # 2) To disable QAT:
 dnl # 	 insmod zfs.ko zfs_qat_disable=1
 dnl #
 dnl # Nothing is checked unless --with-qat=PATH is given.  When it is,
 dnl # QAT_SRC, QAT_OBJ and QAT_SYMBOLS are substituted and HAVE_QAT is
 dnl # defined; configure aborts if cpa.h, the QAT kernel module or
 dnl # Module.symvers cannot be found at the expected locations.
 dnl #
 AC_DEFUN([ZFS_AC_QAT], [
 	AC_ARG_WITH([qat],
 		AS_HELP_STRING([--with-qat=PATH],
 		[Path to qat source]),
 		AS_IF([test "$withval" = "yes"],
 			AC_MSG_ERROR([--with-qat=PATH requires a PATH]),
 			[qatsrc="$withval"]))
 
 	AC_ARG_WITH([qat-obj],
 		AS_HELP_STRING([--with-qat-obj=PATH],
 		[Path to qat build objects]),
 		[qatbuild="$withval"])
 
 	AS_IF([test ! -z "${qatsrc}"], [
 		AC_MSG_CHECKING([qat source directory])
 		AC_MSG_RESULT([$qatsrc])
 		QAT_SRC="${qatsrc}/quickassist"
 		AS_IF([ test ! -e "$QAT_SRC/include/cpa.h"], [
 			AC_MSG_ERROR([
 	*** Please make sure the qat driver package is installed
 	*** and specify the location of the qat source with the
 	*** '--with-qat=PATH' option then try again. Failed to
 	*** find cpa.h in:
 	${QAT_SRC}/include])
 		])
 	])
 
 	AS_IF([test ! -z "${qatsrc}"], [
 		AC_MSG_CHECKING([qat build directory])
 		AS_IF([test -z "$qatbuild"], [
 			qatbuild="${qatsrc}/build"
 		])
 
 		AC_MSG_RESULT([$qatbuild])
 		QAT_OBJ=${qatbuild}
 		AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [
 			AC_MSG_ERROR([
 	*** Please make sure the qat driver is installed then try again.
 	*** Failed to find icp_qa_al.ko or qat_api.ko in:
 	$QAT_OBJ])
 		])
 
 		AC_SUBST(QAT_SRC)
 		AC_SUBST(QAT_OBJ)
 
 		AC_DEFINE(HAVE_QAT, 1,
 		[qat is enabled and existed])
 	])
 
 	dnl #
 	dnl # Detect the name used for the QAT Module.symvers file.
 	dnl #
 	AS_IF([test ! -z "${qatsrc}"], [
 		AC_MSG_CHECKING([qat file for module symbols])
 		QAT_SYMBOLS=$QAT_SRC/lookaside/access_layer/src/Module.symvers
 
 		AS_IF([test -r $QAT_SYMBOLS], [
 			AC_MSG_RESULT([$QAT_SYMBOLS])
 			AC_SUBST(QAT_SYMBOLS)
 		],[
 			AC_MSG_ERROR([
 	*** Please make sure the qat driver is installed then try again.
 	*** Failed to find Module.symvers in:
 	$QAT_SYMBOLS
 			])
 		])
 	])
 ])
 
 dnl #
 dnl # ZFS_LINUX_CONFTEST_H
 dnl #
 dnl # $1 - header contents
 dnl # $2 - test case name
 dnl #
 dnl # Writes $1 to build/$2/$2.h, creating build/$2 if needed.
 dnl #
 AC_DEFUN([ZFS_LINUX_CONFTEST_H], [
 test -d build/$2 || mkdir -p build/$2
 cat - <<_ACEOF >build/$2/$2.h
 $1
 _ACEOF
 ])
 
 dnl #
 dnl # ZFS_LINUX_CONFTEST_C
 dnl #
 dnl # $1 - C source
 dnl # $2 - test case name
 dnl #
 dnl # Writes the contents of confdefs.h followed by $1 to build/$2/$2.c,
 dnl # creating build/$2 if needed.
 dnl #
 AC_DEFUN([ZFS_LINUX_CONFTEST_C], [
 test -d build/$2 || mkdir -p build/$2
 cat confdefs.h - <<_ACEOF >build/$2/$2.c
 $1
 _ACEOF
 ])
 
 dnl #
 dnl # ZFS_LINUX_CONFTEST_MAKEFILE
 dnl #
 dnl # $1 - test case name
 dnl # $2 - add to top-level Makefile ("yes" appends "obj-m += $1/" to
 dnl #      build/Makefile)
 dnl # $3 - additional build flags
 dnl #
 dnl # Writes build/$1/Makefile, which builds $1.o with -Werror,
 dnl # $FRAME_LARGER_THAN and any flags given in $3.
 dnl #
 AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [
 	test -d build || mkdir -p build
 	test -d build/$1 || mkdir -p build/$1
 
 	file=build/$1/Makefile
 
 	dnl # Example command line to manually build source.
 	cat - <<_ACEOF >$file
 # Example command line to manually build source
 # make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1
 
 ccflags-y := -Werror $FRAME_LARGER_THAN
 _ACEOF
 
 	dnl # Additional custom CFLAGS as requested.
 	m4_ifval($3, [echo "ccflags-y += $3" >>$file], [])
 
 	dnl # Test case source
 	echo "obj-m := $1.o" >>$file
 
 	AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], [])
 ])
 
 dnl #
 dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY], [LICENSE])
 dnl #
 dnl # $1 - prologue, emitted at file scope after <linux/module.h>
 dnl # $2 - body, emitted inside main()
 dnl # $3 - expression passed to MODULE_LICENSE()
 dnl #
 m4_define([ZFS_LINUX_TEST_PROGRAM], [
 #include <linux/module.h>
 $1
 
 int
 main (void)
 {
 $2
 	;
 	return 0;
 }
 
 MODULE_DESCRIPTION("conftest");
 MODULE_AUTHOR(ZFS_META_AUTHOR);
 MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE);
 MODULE_LICENSE($3);
 ])
 
 dnl #
 dnl # ZFS_LINUX_TEST_REMOVE
 dnl #
 dnl # Removes the specified test source and results.
 dnl #
 dnl # $1 - test case name
 dnl #
 dnl # Deletes build/$1 and drops every line matching $1 from
 dnl # build/Makefile.  sed writes its result to standard output, so the
 dnl # filtered Makefile is written to a temporary file and moved back into
 dnl # place; without that the Makefile would be left unchanged and its
 dnl # contents printed into the configure output.
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [
 	test -d build/$1 && rm -Rf build/$1
 	test -f build/Makefile &&
 	    sed '/$1/d' build/Makefile >build/Makefile.tmp &&
 	    mv build/Makefile.tmp build/Makefile
 ])
 
 dnl #
 dnl # ZFS_LINUX_COMPILE
 dnl #
 dnl # $1 - build dir
 dnl # $2 - test command
 dnl # $3 - pass command
 dnl # $4 - fail command
 dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes'
 dnl # $6 - set KBUILD_MODPOST_WARN='yes'
 dnl #
 dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST}
 dnl #
 dnl # Runs "make modules" in $LINUX_OBJ for the directory $1, capturing
 dnl # all output in $1/build.log, and then evaluates $2 to choose between
 dnl # $3 and $4.  KERNEL_CC, KERNEL_LD and KERNEL_LLVM, when set, are
 dnl # passed to make as CC, LD and LLVM.
 dnl #
 AC_DEFUN([ZFS_LINUX_COMPILE], [
 	AC_ARG_VAR([KERNEL_CC], [C compiler for
 		building kernel modules])
 	AC_ARG_VAR([KERNEL_LD], [Linker for
 		building kernel modules])
 	AC_ARG_VAR([KERNEL_LLVM], [Binary option to
 		build kernel modules with LLVM/CLANG toolchain])
 	AC_TRY_COMMAND([
 	    KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6"
 	    make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC}
 	    ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM}
 	    CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES
 	    -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1])
 	AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4])
 ])
 
 dnl #
 dnl # ZFS_LINUX_TEST_COMPILE
 dnl #
 dnl # Perform a full compile excluding the final modpost phase.
 dnl #
 dnl # $1 - suffix used when saving the Makefile and build.log
 dnl # $2 - build dir
 dnl #
 dnl # On success $2/Makefile and $2/build.log are renamed to
 dnl # $2/Makefile.compile.$1 and $2/build.log.$1.  Configure aborts if no
 dnl # build.log was produced.
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [
 	ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [
 		mv $2/Makefile $2/Makefile.compile.$1
 		mv $2/build.log $2/build.log.$1
 	],[
 	        AC_MSG_ERROR([
         *** Unable to compile test source to determine kernel interfaces.])
 	], [yes], [])
 ])
 
 dnl #
 dnl # ZFS_LINUX_TEST_MODPOST
 dnl #
 dnl # Perform a full compile including the modpost phase.  This may
 dnl # be an incremental build if the objects have already been built.
 dnl #
 dnl # $1 - suffix used when saving the Makefile and build.log
 dnl # $2 - build dir
 dnl #
 dnl # On success $2/Makefile is renamed to $2/Makefile.modpost.$1 and
 dnl # $2/build.log is appended to build/build.log.$1.  Configure aborts
 dnl # if no build.log was produced.
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [
 	ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [
 		mv $2/Makefile $2/Makefile.modpost.$1
 		cat $2/build.log >>build/build.log.$1
 	],[
 	        AC_MSG_ERROR([
         *** Unable to modpost test source to determine kernel interfaces.])
 	], [], [yes])
 ])
 
 dnl #
 dnl # Perform the compilation of the test cases in two phases.
 dnl #
 dnl # Phase 1) attempt to build the object files for all of the tests
 dnl #          defined by the ZFS_LINUX_TEST_SRC macro.  But do not
 dnl #          perform the final modpost stage.
 dnl #
 dnl # Phase 2) disable all tests which failed the initial compilation,
 dnl #          then invoke the final modpost step for the remaining tests.
 dnl #
 dnl # This allows us to efficiently build the test cases in parallel while
 dnl # remaining resilient to build failures which are expected when
 dnl # detecting the available kernel interfaces.
 dnl #
 dnl # The maximum allowed parallelism can be controlled by setting the
 dnl # TEST_JOBS environment variable.  Otherwise, it defaults to $(nproc).
 dnl #
 dnl # $1 - suffix used for the saved Makefile and build.log files
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [
 	dnl # Phase 1 - Compilation only, final linking is skipped.
 	ZFS_LINUX_TEST_COMPILE([$1], [build])
 
 	dnl #
 	dnl # Phase 2 - When building external modules disable test cases
 	dnl # which failed to compile and invoke modpost to verify the
 	dnl # final linking.
 	dnl #
 	dnl # Test names suffixed with '_license' call modpost independently
 	dnl # to ensure that a single incompatibility does not result in the
 	dnl # modpost phase exiting early.  This check is not performed on
 	dnl # every symbol since the majority are compatible and doing so
 	dnl # would significantly slow down this phase.
 	dnl #
 	dnl # When configuring for builtin (--enable-linux-builtin)
 	dnl # fake the linking step by artificially creating the expected
 	dnl # .ko files for tests which did compile.  This is required for
 	dnl # kernels which do not have loadable module support or have
 	dnl # not yet been built.
 	dnl #
 	AS_IF([test "x$enable_linux_builtin" = "xno"], [
 		for dir in $(awk '/^obj-m/ { print [$]3 }' \
 		    build/Makefile.compile.$1); do
 			name=${dir%/}
 			AS_IF([test -f build/$name/$name.o], [
 				AS_IF([test "${name##*_}" = "license"], [
 					ZFS_LINUX_TEST_MODPOST([$1],
 					    [build/$name])
 					echo "obj-n += $dir" >>build/Makefile
 				], [
 					echo "obj-m += $dir" >>build/Makefile
 				])
 			], [
 				echo "obj-n += $dir" >>build/Makefile
 			])
 		done
 
 		ZFS_LINUX_TEST_MODPOST([$1], [build])
 	], [
 		for dir in $(awk '/^obj-m/ { print [$]3 }' \
 		    build/Makefile.compile.$1); do
 			name=${dir%/}
 			AS_IF([test -f build/$name/$name.o], [
 				touch build/$name/$name.ko
 			])
 		done
 	])
 ])
 
 dnl #
 dnl # ZFS_LINUX_TEST_SRC
 dnl #
 dnl # $1 - name
 dnl # $2 - global
 dnl # $3 - source
 dnl # $4 - extra cflags
 dnl # $5 - check license-compatibility
 dnl #
 dnl # Check if the test source is buildable at all and then if it is
 dnl # license compatible.
 dnl #
 dnl # The test named $1 is always generated with a "Dual BSD/GPL" module
 dnl # license.  When $5 is non-empty a second test named $1_license is
 dnl # generated from the same source with $5 as its module license.
 dnl #
 dnl # N.B because all of the test cases are compiled in parallel they
 dnl # must never depend on the results of previous tests.  Each test
 dnl # needs to be entirely independent.
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_SRC], [
 	ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]],
 	    [["Dual BSD/GPL"]])], [$1])
 	ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4])
 
 	AS_IF([ test -n "$5" ], [
 		ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM(
 		    [[$2]], [[$3]], [[$5]])], [$1_license])
 		ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4])
 	])
 ])
 
 dnl #
 dnl # ZFS_LINUX_TEST_RESULT
 dnl #
 dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC)
 dnl # $2 - run on success (valid .ko generated)
 dnl # $3 - run on failure (unable to compile)
 dnl #
 dnl # Success means build/$1/$1.ko exists.  Configure aborts if the
 dnl # directory build/$1 does not exist at all.
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_RESULT], [
 	AS_IF([test -d build/$1], [
 		AS_IF([test -f build/$1/$1.ko], [$2], [$3])
 	], [
 		AC_MSG_ERROR([
 	*** No matching source for the "$1" test, check that
 	*** both the test source and result macros refer to the same name.
 		])
 	])
 ])
 
 dnl #
 dnl # ZFS_LINUX_TEST_ERROR
 dnl #
 dnl # Generic error message which can be used when none of the expected
 dnl # kernel interfaces were detected.
 dnl #
 dnl # $1 - interface name to show in the message
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_ERROR], [
 	AC_MSG_ERROR([
 	*** None of the expected "$1" interfaces were detected.
 	*** This may be because your kernel version is newer than what is
 	*** supported, or you are using a patched custom kernel with
 	*** incompatible modifications.
 	***
 	*** ZFS Version: $ZFS_META_ALIAS
 	*** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX
 	])
 ])
 
 dnl #
 dnl # ZFS_LINUX_TEST_RESULT_SYMBOL
 dnl #
 dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to
 dnl # verify symbol exports, unless --enable-linux-builtin was provided to
 dnl # configure.
 dnl #
 dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC)
 dnl # $2 - symbol name passed to ZFS_CHECK_SYMBOL_EXPORT
 dnl # $3 - source files passed to ZFS_CHECK_SYMBOL_EXPORT
 dnl # $4 - run on success
 dnl # $5 - run on failure (no .ko generated, or symbol not exported)
 dnl #
 AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [
 	AS_IF([ ! test -f build/$1/$1.ko], [
 		$5
 	], [
 		AS_IF([test "x$enable_linux_builtin" != "xyes"], [
 			ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [$4], [$5])
 		], [
 			$4
 		])
 	])
 ])
 
 dnl #
 dnl # ZFS_LINUX_COMPILE_IFELSE
 dnl #
 dnl # $1 - C source for the conftest module
 dnl # $2 - test command
 dnl # $3 - pass command
 dnl # $4 - fail command
 dnl # $5 - optional contents of conftest.h (an empty header is written
 dnl #      when omitted)
 dnl #
 dnl # Removes any previous conftest, regenerates it under build/conftest
 dnl # and builds it with ZFS_LINUX_COMPILE.
 dnl #
 AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [
 	ZFS_LINUX_TEST_REMOVE([conftest])
 
 	m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])])
 	m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])],
 	    [ZFS_LINUX_CONFTEST_H([], [conftest])])
 
 	ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no],
 	    [m4_ifvaln([$5], [-I$PWD/build/conftest], [])])
 	ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], [])
 ])
 
 dnl #
 dnl # ZFS_LINUX_TRY_COMPILE
 dnl #
 dnl # $1 - global
 dnl # $2 - source
 dnl # $3 - run on success (valid .ko generated)
 dnl # $4 - run on failure (unable to compile)
 dnl #
 dnl # When configuring as builtin (--enable-linux-builtin) for kernels
 dnl # without loadable module support (CONFIG_MODULES=n) only the object
 dnl # file is created.  See ZFS_LINUX_TEST_COMPILE_ALL for details.
 dnl #
 dnl # The two branches differ only in the file tested for success:
 dnl # conftest.o for builtin configurations, conftest.ko otherwise.
 dnl #
 AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [
 	AS_IF([test "x$enable_linux_builtin" = "xyes"], [
 		ZFS_LINUX_COMPILE_IFELSE(
 		    [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]],
 		    [[ZFS_META_LICENSE]])],
 		    [test -f build/conftest/conftest.o], [$3], [$4])
 	], [
 		ZFS_LINUX_COMPILE_IFELSE(
 		    [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]],
 		    [[ZFS_META_LICENSE]])],
 		    [test -f build/conftest/conftest.ko], [$3], [$4])
 	])
 ])
 
 dnl #
 dnl # ZFS_CHECK_SYMBOL_EXPORT
 dnl #
 dnl # Check if a symbol is exported or not by consulting the symbols
 dnl # file, or optionally the source code.
 dnl #
 dnl # $1 - symbol name
 dnl # $2 - source files, relative to $LINUX, searched for an
 dnl #      EXPORT_SYMBOL of $1 when the symbols file has no match
 dnl # $3 - run if the symbol is exported
 dnl # $4 - run if the symbol is not exported
 dnl #
 AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [
 	grep -q -E '[[[:space:]]]$1[[[:space:]]]' \
 		$LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
 	rc=$?
 	if test $rc -ne 0; then
 		export=0
 		for file in $2; do
 			grep -q -E "EXPORT_SYMBOL.*($1)" \
 				"$LINUX/$file" 2>/dev/null
 			rc=$?
 			if test $rc -eq 0; then
 				export=1
 				break;
 			fi
 		done
 		if test $export -eq 0; then :
 			$4
 		else :
 			$3
 		fi
 	else :
 		$3
 	fi
 ])
 
 dnl #
 dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL
 dnl #
 dnl # Like ZFS_LINUX_TRY_COMPILE except ZFS_CHECK_SYMBOL_EXPORT is called
 dnl # to verify symbol exports, unless --enable-linux-builtin was provided
 dnl # to configure.
 dnl #
 dnl # $1 - global
 dnl # $2 - source
 dnl # $3 - symbol name passed to ZFS_CHECK_SYMBOL_EXPORT
 dnl # $4 - source files passed to ZFS_CHECK_SYMBOL_EXPORT
 dnl # $5 - run on success
 dnl # $6 - run on failure (compile failed, or symbol not exported)
 dnl #
 AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [
 	ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1])
 	if test $rc -ne 0; then :
 		$6
 	else
 		if test "x$enable_linux_builtin" != xyes; then
 			ZFS_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1])
 		fi
 		if test $rc -ne 0; then :
 			$6
 		else :
 			$5
 		fi
 	fi
 ])
 
 dnl #
 dnl # ZFS_LINUX_TRY_COMPILE_HEADER
 dnl # like ZFS_LINUX_TRY_COMPILE, except the contents of conftest.h are
 dnl # provided via the fifth parameter
 dnl #
 dnl # $1 - global
 dnl # $2 - source
 dnl # $3 - run on success
 dnl # $4 - run on failure
 dnl # $5 - contents of conftest.h
 dnl #
 AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [
 	AS_IF([test "x$enable_linux_builtin" = "xyes"], [
 		ZFS_LINUX_COMPILE_IFELSE(
 		    [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]],
 		    [[ZFS_META_LICENSE]])],
 		    [test -f build/conftest/conftest.o], [$3], [$4], [$5])
 	], [
 		ZFS_LINUX_COMPILE_IFELSE(
 		    [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]],
 		    [[ZFS_META_LICENSE]])],
 		    [test -f build/conftest/conftest.ko], [$3], [$4], [$5])
 	])
 ])
 
 dnl #
 dnl # AS_VERSION_COMPARE_LE
 dnl # like AS_VERSION_COMPARE, but runs $3 if (and only if) $1 <= $2
 dnl # AS_VERSION_COMPARE_LE (version-1, version-2, [action-if-less-or-equal], [action-if-greater])
 dnl #
 dnl # Implemented by passing $3 as both the "less" and the "equal" action
 dnl # of AS_VERSION_COMPARE and $4 as its "greater" action.
 dnl #
 AC_DEFUN([AS_VERSION_COMPARE_LE], [
 	AS_VERSION_COMPARE([$1], [$2], [$3], [$3], [$4])
 ])
 
 dnl #
 dnl # ZFS_LINUX_REQUIRE_API
 dnl # like ZFS_LINUX_TEST_ERROR, except only fails if the kernel is
 dnl # at least some specified version.
 dnl #
 dnl # $1 - interface name to show in the message
 dnl # $2 - kernel version from which the interface is required
 dnl #
 dnl # Aborts configure when $2 <= $kernsrcver; otherwise reports "no" as
 dnl # the result of the check in progress.
 dnl #
 AC_DEFUN([ZFS_LINUX_REQUIRE_API], [
 	AS_VERSION_COMPARE_LE([$2], [$kernsrcver], [
 		AC_MSG_ERROR([
 		*** None of the expected "$1" interfaces were detected. This
 		*** interface is expected for kernels version "$2" and above.
 		*** This may be because your kernel version is newer than what is
 		*** supported, or you are using a patched custom kernel with
 		*** incompatible modifications.  Newer kernels may have incompatible
 		*** APIs.
 		***
 		*** ZFS Version: $ZFS_META_ALIAS
 		*** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX
 		])
 	], [
 		AC_MSG_RESULT(no)
 	])
 ])
diff --git a/sys/contrib/openzfs/contrib/bash_completion.d/.gitignore b/sys/contrib/openzfs/contrib/bash_completion.d/.gitignore
index 0fd9cc63af2a..217893a6bd89 100644
--- a/sys/contrib/openzfs/contrib/bash_completion.d/.gitignore
+++ b/sys/contrib/openzfs/contrib/bash_completion.d/.gitignore
@@ -1 +1,2 @@
 /zfs
+/zpool
diff --git a/sys/contrib/openzfs/contrib/bash_completion.d/Makefile.am b/sys/contrib/openzfs/contrib/bash_completion.d/Makefile.am
index 1ec05ed73d2d..d3e6c0e79071 100644
--- a/sys/contrib/openzfs/contrib/bash_completion.d/Makefile.am
+++ b/sys/contrib/openzfs/contrib/bash_completion.d/Makefile.am
@@ -1,5 +1,9 @@
-nodist_bashcompletion_DATA  = %D%/zfs
-SUBSTFILES                 += $(nodist_bashcompletion_DATA)
+nodist_bashcompletion_DATA  = %D%/zfs %D%/zpool
+COMPLETION_FILES            = %D%/zfs
+SUBSTFILES                 += $(COMPLETION_FILES)
 
-SHELLCHECKSCRIPTS   += $(nodist_bashcompletion_DATA)
-$(call SHELLCHECK_OPTS,$(nodist_bashcompletion_DATA)): SHELLCHECK_SHELL = bash
+SHELLCHECKSCRIPTS   += $(COMPLETION_FILES)
+$(call SHELLCHECK_OPTS,$(COMPLETION_FILES)): SHELLCHECK_SHELL = bash
+
+%D%/zpool: %D%/zfs
+	$(LN_S) -f zfs $@
diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
index b0f398354e4f..6154c4a86331 100644
--- a/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
+++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/blkdev_compat.h
@@ -1,783 +1,792 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  * LLNL-CODE-403049.
  */
 
 #ifndef _ZFS_BLKDEV_H
 #define	_ZFS_BLKDEV_H
 
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
 #include <linux/hdreg.h>
 #include <linux/major.h>
 #include <linux/msdos_fs.h>	/* for SECTOR_* */
 #include <linux/bio.h>
 
 #ifdef HAVE_BLK_MQ
 #include <linux/blk-mq.h>
 #endif
 
 #ifndef HAVE_BLK_QUEUE_FLAG_SET
 static inline void
 blk_queue_flag_set(unsigned int flag, struct request_queue *q)
 {
 	queue_flag_set(flag, q);
 }
 #endif
 
 #ifndef HAVE_BLK_QUEUE_FLAG_CLEAR
 static inline void
 blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
 {
 	queue_flag_clear(flag, q);
 }
 #endif
 
 /*
+ * 6.11 API
+ * Setting the flush flags directly is no longer possible; flush flags are set
+ * on the queue_limits structure and passed to blk_alloc_disk(). In this case
+ * we remove this function entirely.
+ *
  * 4.7 API,
  * The blk_queue_write_cache() interface has replaced blk_queue_flush()
  * interface.  However, the new interface is GPL-only thus we implement
  * our own trivial wrapper when the GPL-only version is detected.
  *
  * 2.6.36 - 4.6 API,
  * The blk_queue_flush() interface has replaced blk_queue_ordered()
  * interface.  However, while the old interface was available to all the
  * new one is GPL-only.   Thus if the GPL-only version is detected we
  * implement our own trivial helper.
  */
+#if !defined(HAVE_BLK_ALLOC_DISK_2ARG) || \
+	!defined(HAVE_BLKDEV_QUEUE_LIMITS_FEATURES)
 static inline void
-blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua)
+blk_queue_set_write_cache(struct request_queue *q, bool on)
 {
 #if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY)
-	if (wc)
+	if (on) {
 		blk_queue_flag_set(QUEUE_FLAG_WC, q);
-	else
-		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
-	if (fua)
 		blk_queue_flag_set(QUEUE_FLAG_FUA, q);
-	else
+	} else {
+		blk_queue_flag_clear(QUEUE_FLAG_WC, q);
 		blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
+	}
 #elif defined(HAVE_BLK_QUEUE_WRITE_CACHE)
-	blk_queue_write_cache(q, wc, fua);
+	blk_queue_write_cache(q, on, on);
 #elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY)
-	if (wc)
-		q->flush_flags |= REQ_FLUSH;
-	if (fua)
-		q->flush_flags |= REQ_FUA;
+	if (on)
+		q->flush_flags |= REQ_FLUSH | REQ_FUA;
+	else
+		q->flush_flags &= ~(REQ_FLUSH | REQ_FUA);
 #elif defined(HAVE_BLK_QUEUE_FLUSH)
-	blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0));
+	blk_queue_flush(q, on ? (REQ_FLUSH | REQ_FUA) : 0);
 #else
 #error "Unsupported kernel"
 #endif
 }
+#endif /* !HAVE_BLK_ALLOC_DISK_2ARG || !HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */
 
 static inline void
 blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages)
 {
 #if !defined(HAVE_BLK_QUEUE_UPDATE_READAHEAD) && \
 	!defined(HAVE_DISK_UPDATE_READAHEAD)
-#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC
+#if defined(HAVE_BLK_QUEUE_BDI_DYNAMIC)
 	q->backing_dev_info->ra_pages = ra_pages;
+#elif defined(HAVE_BLK_QUEUE_DISK_BDI)
+	q->disk->bdi->ra_pages = ra_pages;
 #else
 	q->backing_dev_info.ra_pages = ra_pages;
 #endif
 #endif
 }
 
 #ifdef HAVE_BIO_BVEC_ITER
 #define	BIO_BI_SECTOR(bio)	(bio)->bi_iter.bi_sector
 #define	BIO_BI_SIZE(bio)	(bio)->bi_iter.bi_size
 #define	BIO_BI_IDX(bio)		(bio)->bi_iter.bi_idx
 #define	BIO_BI_SKIP(bio)	(bio)->bi_iter.bi_bvec_done
 #define	bio_for_each_segment4(bv, bvp, b, i)	\
 	bio_for_each_segment((bv), (b), (i))
 typedef struct bvec_iter bvec_iterator_t;
 #else
 #define	BIO_BI_SECTOR(bio)	(bio)->bi_sector
 #define	BIO_BI_SIZE(bio)	(bio)->bi_size
 #define	BIO_BI_IDX(bio)		(bio)->bi_idx
 #define	BIO_BI_SKIP(bio)	(0)
 #define	bio_for_each_segment4(bv, bvp, b, i)	\
 	bio_for_each_segment((bvp), (b), (i))
 typedef int bvec_iterator_t;
 #endif
 
 static inline void
 bio_set_flags_failfast(struct block_device *bdev, int *flags, bool dev,
     bool transport, bool driver)
 {
 #ifdef CONFIG_BUG
 	/*
 	 * Disable FAILFAST for loopback devices because of the
 	 * following incorrect BUG_ON() in loop_make_request().
 	 * This support is also disabled for md devices because the
 	 * test suite layers md devices on top of loopback devices.
 	 * This may be removed when the loopback driver is fixed.
 	 *
 	 *   BUG_ON(!lo || (rw != READ && rw != WRITE));
 	 */
 	if ((MAJOR(bdev->bd_dev) == LOOP_MAJOR) ||
 	    (MAJOR(bdev->bd_dev) == MD_MAJOR))
 		return;
 
 #ifdef BLOCK_EXT_MAJOR
 	if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
 		return;
 #endif /* BLOCK_EXT_MAJOR */
 #endif /* CONFIG_BUG */
 
 	if (dev)
 		*flags |= REQ_FAILFAST_DEV;
 	if (transport)
 		*flags |= REQ_FAILFAST_TRANSPORT;
 	if (driver)
 		*flags |= REQ_FAILFAST_DRIVER;
 }
 
 /*
  * Maximum disk label length, it may be undefined for some kernels.
  */
 #if !defined(DISK_NAME_LEN)
 #define	DISK_NAME_LEN	32
 #endif /* DISK_NAME_LEN */
 
 #ifdef HAVE_BIO_BI_STATUS
 static inline int
 bi_status_to_errno(blk_status_t status)
 {
 	switch (status)	{
 	case BLK_STS_OK:
 		return (0);
 	case BLK_STS_NOTSUPP:
 		return (EOPNOTSUPP);
 	case BLK_STS_TIMEOUT:
 		return (ETIMEDOUT);
 	case BLK_STS_NOSPC:
 		return (ENOSPC);
 	case BLK_STS_TRANSPORT:
 		return (ENOLINK);
 	case BLK_STS_TARGET:
 		return (EREMOTEIO);
 #ifdef HAVE_BLK_STS_RESV_CONFLICT
 	case BLK_STS_RESV_CONFLICT:
 #else
 	case BLK_STS_NEXUS:
 #endif
 		return (EBADE);
 	case BLK_STS_MEDIUM:
 		return (ENODATA);
 	case BLK_STS_PROTECTION:
 		return (EILSEQ);
 	case BLK_STS_RESOURCE:
 		return (ENOMEM);
 	case BLK_STS_AGAIN:
 		return (EAGAIN);
 	case BLK_STS_IOERR:
 		return (EIO);
 	default:
 		return (EIO);
 	}
 }
 
 static inline blk_status_t
 errno_to_bi_status(int error)
 {
 	switch (error) {
 	case 0:
 		return (BLK_STS_OK);
 	case EOPNOTSUPP:
 		return (BLK_STS_NOTSUPP);
 	case ETIMEDOUT:
 		return (BLK_STS_TIMEOUT);
 	case ENOSPC:
 		return (BLK_STS_NOSPC);
 	case ENOLINK:
 		return (BLK_STS_TRANSPORT);
 	case EREMOTEIO:
 		return (BLK_STS_TARGET);
 	case EBADE:
 #ifdef HAVE_BLK_STS_RESV_CONFLICT
 		return (BLK_STS_RESV_CONFLICT);
 #else
 		return (BLK_STS_NEXUS);
 #endif
 	case ENODATA:
 		return (BLK_STS_MEDIUM);
 	case EILSEQ:
 		return (BLK_STS_PROTECTION);
 	case ENOMEM:
 		return (BLK_STS_RESOURCE);
 	case EAGAIN:
 		return (BLK_STS_AGAIN);
 	case EIO:
 		return (BLK_STS_IOERR);
 	default:
 		return (BLK_STS_IOERR);
 	}
 }
 #endif /* HAVE_BIO_BI_STATUS */
 
 /*
  * 4.3 API change
  * The bio_endio() prototype changed slightly.  These are helper
  * macro's to ensure the prototype and invocation are handled.
  */
 #ifdef HAVE_1ARG_BIO_END_IO_T
 #ifdef HAVE_BIO_BI_STATUS
 #define	BIO_END_IO_ERROR(bio)		bi_status_to_errno(bio->bi_status)
 #define	BIO_END_IO_PROTO(fn, x, z)	static void fn(struct bio *x)
 #define	BIO_END_IO(bio, error)		bio_set_bi_status(bio, error)
 static inline void
 bio_set_bi_status(struct bio *bio, int error)
 {
 	ASSERT3S(error, <=, 0);
 	bio->bi_status = errno_to_bi_status(-error);
 	bio_endio(bio);
 }
 #else
 #define	BIO_END_IO_ERROR(bio)		(-(bio->bi_error))
 #define	BIO_END_IO_PROTO(fn, x, z)	static void fn(struct bio *x)
 #define	BIO_END_IO(bio, error)		bio_set_bi_error(bio, error)
 static inline void
 bio_set_bi_error(struct bio *bio, int error)
 {
 	ASSERT3S(error, <=, 0);
 	bio->bi_error = error;
 	bio_endio(bio);
 }
 #endif /* HAVE_BIO_BI_STATUS */
 
 #else
 #define	BIO_END_IO_PROTO(fn, x, z)	static void fn(struct bio *x, int z)
 #define	BIO_END_IO(bio, error)		bio_endio(bio, error);
 #endif /* HAVE_1ARG_BIO_END_IO_T */
 
 /*
  * 5.15 MACRO,
  *   GD_DEAD
  *
  * 2.6.36 - 5.14 MACRO,
  *   GENHD_FL_UP
  *
  * Check the disk status and return B_TRUE if alive
  * otherwise B_FALSE
  */
 static inline boolean_t
 zfs_check_disk_status(struct block_device *bdev)
 {
 #if defined(GENHD_FL_UP)
 	return (!!(bdev->bd_disk->flags & GENHD_FL_UP));
 #elif defined(GD_DEAD)
 	return (!test_bit(GD_DEAD, &bdev->bd_disk->state));
 #else
 /*
  * This is encountered if neither GENHD_FL_UP nor GD_DEAD is available in
  * the kernel - likely due to an MACRO change that needs to be chased down.
  */
 #error "Unsupported kernel: no usable disk status check"
 #endif
 }
 
 /*
  * 4.1 API,
  * 3.10.0 CentOS 7.x API,
  *   blkdev_reread_part()
  *
  * For older kernels trigger a re-reading of the partition table by calling
  * check_disk_change() which calls flush_disk() to invalidate the device.
  *
  * For newer kernels (as of 5.10), bdev_check_media_change is used, in favor of
  * check_disk_change(), with the modification that invalidation is no longer
  * forced.
  */
 #ifdef HAVE_CHECK_DISK_CHANGE
 #define	zfs_check_media_change(bdev)	check_disk_change(bdev)
 #ifdef HAVE_BLKDEV_REREAD_PART
 #define	vdev_bdev_reread_part(bdev)	blkdev_reread_part(bdev)
 #else
 #define	vdev_bdev_reread_part(bdev)	check_disk_change(bdev)
 #endif /* HAVE_BLKDEV_REREAD_PART */
 #else
 #ifdef HAVE_BDEV_CHECK_MEDIA_CHANGE
 static inline int
 zfs_check_media_change(struct block_device *bdev)
 {
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
 	struct gendisk *gd = bdev->bd_disk;
 	const struct block_device_operations *bdo = gd->fops;
 #endif
 
 	if (!bdev_check_media_change(bdev))
 		return (0);
 
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
 	/*
 	 * Force revalidation, to mimic the old behavior of
 	 * check_disk_change()
 	 */
 	if (bdo->revalidate_disk)
 		bdo->revalidate_disk(gd);
 #endif
 
 	return (0);
 }
 #define	vdev_bdev_reread_part(bdev)	zfs_check_media_change(bdev)
 #elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE)
 #define	vdev_bdev_reread_part(bdev)	disk_check_media_change(bdev->bd_disk)
 #define	zfs_check_media_change(bdev)	disk_check_media_change(bdev->bd_disk)
 #else
 /*
  * This is encountered if check_disk_change() and bdev_check_media_change()
  * are not available in the kernel - likely due to an API change that needs
  * to be chased down.
  */
 #error "Unsupported kernel: no usable disk change check"
 #endif /* HAVE_BDEV_CHECK_MEDIA_CHANGE */
 #endif /* HAVE_CHECK_DISK_CHANGE */
 
 /*
  * 2.6.27 API change
  * The function was exported for use, prior to this it existed but the
  * symbol was not exported.
  *
  * 4.4.0-6.21 API change for Ubuntu
  * lookup_bdev() gained a second argument, FMODE_*, to check inode permissions.
  *
  * 5.11 API change
  * Changed to take a dev_t argument which is set on success and return a
  * non-zero error code on failure.
  */
 static inline int
 vdev_lookup_bdev(const char *path, dev_t *dev)
 {
 #if defined(HAVE_DEVT_LOOKUP_BDEV)
 	return (lookup_bdev(path, dev));
 #elif defined(HAVE_1ARG_LOOKUP_BDEV)
 	struct block_device *bdev = lookup_bdev(path);
 	if (IS_ERR(bdev))
 		return (PTR_ERR(bdev));
 
 	*dev = bdev->bd_dev;
 	bdput(bdev);
 
 	return (0);
 #elif defined(HAVE_MODE_LOOKUP_BDEV)
 	struct block_device *bdev = lookup_bdev(path, FMODE_READ);
 	if (IS_ERR(bdev))
 		return (PTR_ERR(bdev));
 
 	*dev = bdev->bd_dev;
 	bdput(bdev);
 
 	return (0);
 #else
 #error "Unsupported kernel"
 #endif
 }
 
 #if defined(HAVE_BLK_MODE_T)
 #define	blk_mode_is_open_write(flag)	((flag) & BLK_OPEN_WRITE)
 #else
 #define	blk_mode_is_open_write(flag)	((flag) & FMODE_WRITE)
 #endif
 
 /*
  * Kernels without bio_set_op_attrs use bi_rw for the bio flags.
  */
 #if !defined(HAVE_BIO_SET_OP_ATTRS)
 static inline void
 bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags)
 {
 #if defined(HAVE_BIO_BI_OPF)
 	bio->bi_opf = rw | flags;
 #else
 	bio->bi_rw |= rw | flags;
 #endif /* HAVE_BIO_BI_OPF */
 }
 #endif
 
 /*
  * bio_set_flush - Set the appropriate flags in a bio to guarantee
  * data are on non-volatile media on completion.
  *
  * 2.6.37 - 4.8 API,
  *   Introduce WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags as a
  *   replacement for WRITE_BARRIER to allow expressing richer semantics
  *   to the block layer.  It's up to the block layer to implement the
  *   semantics correctly. Use the WRITE_FLUSH_FUA flag combination.
  *
  * 4.8 - 4.9 API,
  *   REQ_FLUSH was renamed to REQ_PREFLUSH.  For consistency with previous
  *   OpenZFS releases, prefer the WRITE_FLUSH_FUA flag set if it's available.
  *
  * 4.10 API,
  *   The read/write flags and their modifiers, including WRITE_FLUSH,
  *   WRITE_FUA and WRITE_FLUSH_FUA were removed from fs.h in
  *   torvalds/linux@70fd7614 and replaced by direct flag modification
  *   of the REQ_ flags in bio->bi_opf.  Use REQ_PREFLUSH.
  */
 static inline void
 bio_set_flush(struct bio *bio)
 {
 #if defined(HAVE_REQ_PREFLUSH)	/* >= 4.10 */
 	bio_set_op_attrs(bio, 0, REQ_PREFLUSH | REQ_OP_WRITE);
 #elif defined(WRITE_FLUSH_FUA)	/* >= 2.6.37 and <= 4.9 */
 	bio_set_op_attrs(bio, 0, WRITE_FLUSH_FUA);
 #else
 #error	"Allowing the build will cause bio_set_flush requests to be ignored."
 #endif
 }
 
 /*
  * 4.8 API,
  *   REQ_OP_FLUSH
  *
  * 4.8-rc0 - 4.8-rc1,
  *   REQ_PREFLUSH
  *
  * 2.6.36 - 4.7 API,
  *   REQ_FLUSH
  *
  * in all cases but may have a performance impact for some kernels.  It
  * has the advantage of minimizing kernel specific changes in the zvol code.
  *
  */
 static inline boolean_t
 bio_is_flush(struct bio *bio)
 {
 #if defined(HAVE_REQ_OP_FLUSH) && defined(HAVE_BIO_BI_OPF)
 	return ((bio_op(bio) == REQ_OP_FLUSH) || (bio->bi_opf & REQ_PREFLUSH));
 #elif defined(HAVE_REQ_PREFLUSH) && defined(HAVE_BIO_BI_OPF)
 	return (bio->bi_opf & REQ_PREFLUSH);
 #elif defined(HAVE_REQ_PREFLUSH) && !defined(HAVE_BIO_BI_OPF)
 	return (bio->bi_rw & REQ_PREFLUSH);
 #elif defined(HAVE_REQ_FLUSH)
 	return (bio->bi_rw & REQ_FLUSH);
 #else
 #error	"Unsupported kernel"
 #endif
 }
 
 /*
  * 4.8 API,
  *   REQ_FUA flag moved to bio->bi_opf
  *
  * 2.6.x - 4.7 API,
  *   REQ_FUA
  */
 static inline boolean_t
 bio_is_fua(struct bio *bio)
 {
 #if defined(HAVE_BIO_BI_OPF)
 	return (bio->bi_opf & REQ_FUA);
 #elif defined(REQ_FUA)
 	return (bio->bi_rw & REQ_FUA);
 #else
 #error	"Allowing the build will cause fua requests to be ignored."
 #endif
 }
 
 /*
  * 4.8 API,
  *   REQ_OP_DISCARD
  *
  * 2.6.36 - 4.7 API,
  *   REQ_DISCARD
  *
  * In all cases the normal I/O path is used for discards.  The only
  * difference is how the kernel tags individual I/Os as discards.
  */
 static inline boolean_t
 bio_is_discard(struct bio *bio)
 {
 #if defined(HAVE_REQ_OP_DISCARD)
 	return (bio_op(bio) == REQ_OP_DISCARD);
 #elif defined(HAVE_REQ_DISCARD)
 	return (bio->bi_rw & REQ_DISCARD);
 #else
 #error "Unsupported kernel"
 #endif
 }
 
 /*
  * 4.8 API,
  *   REQ_OP_SECURE_ERASE
  *
  * 2.6.36 - 4.7 API,
  *   REQ_SECURE
  */
 static inline boolean_t
 bio_is_secure_erase(struct bio *bio)
 {
 #if defined(HAVE_REQ_OP_SECURE_ERASE)
 	return (bio_op(bio) == REQ_OP_SECURE_ERASE);
 #elif defined(REQ_SECURE)
 	return (bio->bi_rw & REQ_SECURE);
 #else
 	return (0);
 #endif
 }
 
 /*
  * 2.6.33 API change
  * Discard granularity and alignment restrictions may now be set.  For
  * older kernels which do not support this it is safe to skip it.
  */
 static inline void
 blk_queue_discard_granularity(struct request_queue *q, unsigned int dg)
 {
 	q->limits.discard_granularity = dg;
 }
 
 /*
  * 5.19 API,
  *   bdev_max_discard_sectors()
  *
  * 2.6.32 API,
  *   blk_queue_discard()
  */
 static inline boolean_t
 bdev_discard_supported(struct block_device *bdev)
 {
 #if defined(HAVE_BDEV_MAX_DISCARD_SECTORS)
 	return (bdev_max_discard_sectors(bdev) > 0 &&
 	    bdev_discard_granularity(bdev) > 0);
 #elif defined(HAVE_BLK_QUEUE_DISCARD)
 	return (blk_queue_discard(bdev_get_queue(bdev)) > 0 &&
 	    bdev_get_queue(bdev)->limits.discard_granularity > 0);
 #else
 #error "Unsupported kernel"
 #endif
 }
 
 /*
  * 5.19 API,
  *   bdev_max_secure_erase_sectors()
  *
  * 4.8 API,
  *   blk_queue_secure_erase()
  *
  * 2.6.36 - 4.7 API,
  *   blk_queue_secdiscard()
  */
 static inline boolean_t
 bdev_secure_discard_supported(struct block_device *bdev)
 {
 #if defined(HAVE_BDEV_MAX_SECURE_ERASE_SECTORS)
 	return (!!bdev_max_secure_erase_sectors(bdev));
 #elif defined(HAVE_BLK_QUEUE_SECURE_ERASE)
 	return (!!blk_queue_secure_erase(bdev_get_queue(bdev)));
 #elif defined(HAVE_BLK_QUEUE_SECDISCARD)
 	return (!!blk_queue_secdiscard(bdev_get_queue(bdev)));
 #else
 #error "Unsupported kernel"
 #endif
 }
 
 /*
  * A common holder for vdev_bdev_open() is used to relax the exclusive open
  * semantics slightly.  Internal vdev disk callers may pass VDEV_HOLDER to
  * allow them to open the device multiple times.  Other kernel callers and
  * user space processes which don't pass this value will get EBUSY.  This is
  * currently required for the correct operation of hot spares.
  */
 #define	VDEV_HOLDER			((void *)0x2401de7)
 
 static inline unsigned long
 blk_generic_start_io_acct(struct request_queue *q __attribute__((unused)),
     struct gendisk *disk __attribute__((unused)),
     int rw __attribute__((unused)), struct bio *bio)
 {
 #if defined(HAVE_BDEV_IO_ACCT_63)
 	return (bdev_start_io_acct(bio->bi_bdev, bio_op(bio),
 	    jiffies));
 #elif defined(HAVE_BDEV_IO_ACCT_OLD)
 	return (bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
 	    bio_op(bio), jiffies));
 #elif defined(HAVE_DISK_IO_ACCT)
 	return (disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio)));
 #elif defined(HAVE_BIO_IO_ACCT)
 	return (bio_start_io_acct(bio));
 #elif defined(HAVE_GENERIC_IO_ACCT_3ARG)
 	unsigned long start_time = jiffies;
 	generic_start_io_acct(rw, bio_sectors(bio), &disk->part0);
 	return (start_time);
 #elif defined(HAVE_GENERIC_IO_ACCT_4ARG)
 	unsigned long start_time = jiffies;
 	generic_start_io_acct(q, rw, bio_sectors(bio), &disk->part0);
 	return (start_time);
 #else
 	/* Unsupported */
 	return (0);
 #endif
 }
 
 static inline void
 blk_generic_end_io_acct(struct request_queue *q __attribute__((unused)),
     struct gendisk *disk __attribute__((unused)),
     int rw __attribute__((unused)), struct bio *bio, unsigned long start_time)
 {
 #if defined(HAVE_BDEV_IO_ACCT_63)
 	bdev_end_io_acct(bio->bi_bdev, bio_op(bio), bio_sectors(bio),
 	    start_time);
 #elif defined(HAVE_BDEV_IO_ACCT_OLD)
 	bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time);
 #elif defined(HAVE_DISK_IO_ACCT)
 	disk_end_io_acct(disk, bio_op(bio), start_time);
 #elif defined(HAVE_BIO_IO_ACCT)
 	bio_end_io_acct(bio, start_time);
 #elif defined(HAVE_GENERIC_IO_ACCT_3ARG)
 	generic_end_io_acct(rw, &disk->part0, start_time);
 #elif defined(HAVE_GENERIC_IO_ACCT_4ARG)
 	generic_end_io_acct(q, rw, &disk->part0, start_time);
 #endif
 }
 
 #ifndef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 static inline struct request_queue *
 blk_generic_alloc_queue(make_request_fn make_request, int node_id)
 {
 #if defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN)
 	return (blk_alloc_queue(make_request, node_id));
 #elif defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH)
 	return (blk_alloc_queue_rh(make_request, node_id));
 #else
 	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);
 	if (q != NULL)
 		blk_queue_make_request(q, make_request);
 
 	return (q);
 #endif
 }
 #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
 
 /*
  * All the io_*() helper functions below can operate on a bio, or a rq, but
  * not both.  The older submit_bio() codepath will pass a bio, and the
  * newer blk-mq codepath will pass a rq.
  */
 static inline int
 io_data_dir(struct bio *bio, struct request *rq)
 {
 #ifdef HAVE_BLK_MQ
 	if (rq != NULL) {
 		if (op_is_write(req_op(rq))) {
 			return (WRITE);
 		} else {
 			return (READ);
 		}
 	}
 #else
 	ASSERT3P(rq, ==, NULL);
 #endif
 	return (bio_data_dir(bio));
 }
 
 static inline int
 io_is_flush(struct bio *bio, struct request *rq)
 {
 #ifdef HAVE_BLK_MQ
 	if (rq != NULL)
 		return (req_op(rq) == REQ_OP_FLUSH);
 #else
 	ASSERT3P(rq, ==, NULL);
 #endif
 	return (bio_is_flush(bio));
 }
 
 static inline int
 io_is_discard(struct bio *bio, struct request *rq)
 {
 #ifdef HAVE_BLK_MQ
 	if (rq != NULL)
 		return (req_op(rq) == REQ_OP_DISCARD);
 #else
 	ASSERT3P(rq, ==, NULL);
 #endif
 	return (bio_is_discard(bio));
 }
 
 static inline int
 io_is_secure_erase(struct bio *bio, struct request *rq)
 {
 #ifdef HAVE_BLK_MQ
 	if (rq != NULL)
 		return (req_op(rq) == REQ_OP_SECURE_ERASE);
 #else
 	ASSERT3P(rq, ==, NULL);
 #endif
 	return (bio_is_secure_erase(bio));
 }
 
 static inline int
 io_is_fua(struct bio *bio, struct request *rq)
 {
 #ifdef HAVE_BLK_MQ
 	if (rq != NULL)
 		return (rq->cmd_flags & REQ_FUA);
 #else
 	ASSERT3P(rq, ==, NULL);
 #endif
 	return (bio_is_fua(bio));
 }
 
 
 static inline uint64_t
 io_offset(struct bio *bio, struct request *rq)
 {
 #ifdef HAVE_BLK_MQ
 	if (rq != NULL)
 		return (blk_rq_pos(rq) << 9);
 #else
 	ASSERT3P(rq, ==, NULL);
 #endif
 	return (BIO_BI_SECTOR(bio) << 9);
 }
 
 static inline uint64_t
 io_size(struct bio *bio, struct request *rq)
 {
 #ifdef HAVE_BLK_MQ
 	if (rq != NULL)
 		return (blk_rq_bytes(rq));
 #else
 	ASSERT3P(rq, ==, NULL);
 #endif
 	return (BIO_BI_SIZE(bio));
 }
 
 static inline int
 io_has_data(struct bio *bio, struct request *rq)
 {
 #ifdef HAVE_BLK_MQ
 	if (rq != NULL)
 		return (bio_has_data(rq->bio));
 #else
 	ASSERT3P(rq, ==, NULL);
 #endif
 	return (bio_has_data(bio));
 }
 #endif /* _ZFS_BLKDEV_H */
diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/mm_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/mm_compat.h
index 40056c68d6dd..817f6df422de 100644
--- a/sys/contrib/openzfs/include/os/linux/kernel/linux/mm_compat.h
+++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/mm_compat.h
@@ -1,36 +1,43 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2023, 2024, Klara Inc.
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
  */
 
 #ifndef _ZFS_MM_COMPAT_H
 #define	_ZFS_MM_COMPAT_H
 
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 
 /* 5.4 introduced page_size(). Older kernels can use a trivial macro instead */
 #ifndef HAVE_MM_PAGE_SIZE
 #define	page_size(p) ((unsigned long)(PAGE_SIZE << compound_order(p)))
 #endif
 
+/* 6.11 removed page_mapping(). A simple wrapper around folio_mapping() works */
+#ifndef HAVE_MM_PAGE_MAPPING
+#define	page_mapping(p) folio_mapping(page_folio(p))
+#endif
+
 #endif /* _ZFS_MM_COMPAT_H */
diff --git a/sys/contrib/openzfs/man/man4/zfs.4 b/sys/contrib/openzfs/man/man4/zfs.4
index 717cf8e6eae8..c17bfb80bf09 100644
--- a/sys/contrib/openzfs/man/man4/zfs.4
+++ b/sys/contrib/openzfs/man/man4/zfs.4
@@ -1,2687 +1,2693 @@
 .\"
 .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
 .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved.
 .\" Copyright (c) 2019 Datto Inc.
 .\" Copyright (c) 2023, 2024 Klara, Inc.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License").  You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
 .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0.
 .\"
 .\" See the License for the specific language governing permissions and
 .\" limitations under the License. When distributing Covered Code, include this
 .\" CDDL HEADER in each file and include the License file at
 .\" usr/src/OPENSOLARIS.LICENSE.  If applicable, add the following below this
 .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
 .\"
 .Dd January 9, 2024
 .Dt ZFS 4
 .Os
 .
 .Sh NAME
 .Nm zfs
 .Nd tuning of the ZFS kernel module
 .
 .Sh DESCRIPTION
 The ZFS module supports these parameters:
 .Bl -tag -width Ds
 .It Sy dbuf_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64
 Maximum size in bytes of the dbuf cache.
 The target size is determined by the MIN versus
 .No 1/2^ Ns Sy dbuf_cache_shift Pq 1/32nd
 of the target ARC size.
 The behavior of the dbuf cache and its associated settings
 can be observed via the
 .Pa /proc/spl/kstat/zfs/dbufstats
 kstat.
 .
 .It Sy dbuf_metadata_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64
 Maximum size in bytes of the metadata dbuf cache.
 The target size is determined by the MIN versus
 .No 1/2^ Ns Sy dbuf_metadata_cache_shift Pq 1/64th
 of the target ARC size.
 The behavior of the metadata dbuf cache and its associated settings
 can be observed via the
 .Pa /proc/spl/kstat/zfs/dbufstats
 kstat.
 .
 .It Sy dbuf_cache_hiwater_pct Ns = Ns Sy 10 Ns % Pq uint
 The percentage over
 .Sy dbuf_cache_max_bytes
 when dbufs must be evicted directly.
 .
 .It Sy dbuf_cache_lowater_pct Ns = Ns Sy 10 Ns % Pq uint
 The percentage below
 .Sy dbuf_cache_max_bytes
 when the evict thread stops evicting dbufs.
 .
 .It Sy dbuf_cache_shift Ns = Ns Sy 5 Pq uint
 Set the size of the dbuf cache
 .Pq Sy dbuf_cache_max_bytes
 to a log2 fraction of the target ARC size.
 .
 .It Sy dbuf_metadata_cache_shift Ns = Ns Sy 6 Pq uint
 Set the size of the dbuf metadata cache
 .Pq Sy dbuf_metadata_cache_max_bytes
 to a log2 fraction of the target ARC size.
 .
 .It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint
 Set the size of the mutex array for the dbuf cache.
 When set to
 .Sy 0
 the array is dynamically sized based on total system memory.
 .
 .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint
 dnode slots allocated in a single operation as a power of 2.
 The default value minimizes lock contention for the bulk operation performed.
 .
 .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
 Limit the amount we can prefetch with one call to this amount in bytes.
 This helps to limit the amount of memory that can be used by prefetching.
 .
 .It Sy ignore_hole_birth Pq int
 Alias for
 .Sy send_holes_without_birth_time .
 .
 .It Sy l2arc_feed_again Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Turbo L2ARC warm-up.
 When the L2ARC is cold the fill interval will be set as fast as possible.
 .
 .It Sy l2arc_feed_min_ms Ns = Ns Sy 200 Pq u64
 Min feed interval in milliseconds.
 Requires
 .Sy l2arc_feed_again Ns = Ns Ar 1
 and only applicable in related situations.
 .
 .It Sy l2arc_feed_secs Ns = Ns Sy 1 Pq u64
 Seconds between L2ARC writing.
 .
 .It Sy l2arc_headroom Ns = Ns Sy 2 Pq u64
 How far through the ARC lists to search for L2ARC cacheable content,
 expressed as a multiplier of
 .Sy l2arc_write_max .
 ARC persistence across reboots can be achieved with persistent L2ARC
 by setting this parameter to
 .Sy 0 ,
 allowing the full length of ARC lists to be searched for cacheable content.
 .
 .It Sy l2arc_headroom_boost Ns = Ns Sy 200 Ns % Pq u64
 Scales
 .Sy l2arc_headroom
 by this percentage when L2ARC contents are being successfully compressed
 before writing.
 A value of
 .Sy 100
 disables this feature.
 .
 .It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Controls whether buffers present on special vdevs are eligible for caching
 into L2ARC.
 If set to 1, exclude dbufs on special vdevs from being cached to L2ARC.
 .
-.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Pq  int
+.It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int
 Controls whether only MFU metadata and data are cached from ARC into L2ARC.
 This may be desired to avoid wasting space on L2ARC when reading/writing large
 amounts of data that are not expected to be accessed more than once.
 .Pp
-The default is off,
+The default is 0,
 meaning both MRU and MFU data and metadata are cached.
-When turning off this feature, some MRU buffers will still be present
-in ARC and eventually cached on L2ARC.
+When turning off this feature (setting it to 0), some MRU buffers will
+still be present in ARC and eventually cached on L2ARC.
 .No If Sy l2arc_noprefetch Ns = Ns Sy 0 ,
 some prefetched buffers will be cached to L2ARC, and those might later
 transition to MRU, in which case the
 .Sy l2arc_mru_asize No arcstat will not be Sy 0 .
 .Pp
+Setting it to 1 means to L2 cache only MFU data and metadata.
+.Pp
+Setting it to 2 means to L2 cache all metadata (MRU+MFU) but
+only MFU data (i.e. MRU data are not cached). This can be the right setting
+to cache as much metadata as possible even when having high data turnover.
+.Pp
 Regardless of
 .Sy l2arc_noprefetch ,
 some MFU buffers might be evicted from ARC,
 accessed later on as prefetches and transition to MRU as prefetches.
 If accessed again they are counted as MRU and the
 .Sy l2arc_mru_asize No arcstat will not be Sy 0 .
 .Pp
 The ARC status of L2ARC buffers when they were first cached in
 L2ARC can be seen in the
 .Sy l2arc_mru_asize , Sy l2arc_mfu_asize , No and Sy l2arc_prefetch_asize
 arcstats when importing the pool or onlining a cache
 device if persistent L2ARC is enabled.
 .Pp
 The
 .Sy evict_l2_eligible_mru
 arcstat does not take into account if this option is enabled as the information
 provided by the
 .Sy evict_l2_eligible_m[rf]u
 arcstats can be used to decide if toggling this option is appropriate
 for the current workload.
 .
 .It Sy l2arc_meta_percent Ns = Ns Sy 33 Ns % Pq uint
 Percent of ARC size allowed for L2ARC-only headers.
 Since L2ARC buffers are not evicted on memory pressure,
 too many headers on a system with an irrationally large L2ARC
 can render it slow or unusable.
 This parameter limits L2ARC writes and rebuilds to achieve the target.
 .
 .It Sy l2arc_trim_ahead Ns = Ns Sy 0 Ns % Pq u64
 Trims ahead of the current write size
 .Pq Sy l2arc_write_max
 on L2ARC devices by this percentage of write size if we have filled the device.
 If set to
 .Sy 100
 we TRIM twice the space required to accommodate upcoming writes.
 A minimum of
 .Sy 64 MiB
 will be trimmed.
 It also enables TRIM of the whole L2ARC device upon creation
 or addition to an existing pool or if the header of the device is
 invalid upon importing a pool or onlining a cache device.
 A value of
 .Sy 0
 disables TRIM on L2ARC altogether and is the default as it can put significant
 stress on the underlying storage devices.
 This will vary depending on how well the specific device handles these commands.
 .
 .It Sy l2arc_noprefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Do not write buffers to L2ARC if they were prefetched but not used by
 applications.
 In case there are prefetched buffers in L2ARC and this option
 is later set, we do not read the prefetched buffers from L2ARC.
 Unsetting this option is useful for caching sequential reads from the
 disks to L2ARC and serving those reads from L2ARC later on.
 This may be beneficial in case the L2ARC device is significantly faster
 in sequential reads than the disks of the pool.
 .Pp
 Use
 .Sy 1
 to disable and
 .Sy 0
 to enable caching/reading prefetches to/from L2ARC.
 .
 .It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int
 No reads during writes.
 .
 .It Sy l2arc_write_boost Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq u64
 Cold L2ARC devices will have
 .Sy l2arc_write_max
 increased by this amount while they remain cold.
 .
 .It Sy l2arc_write_max Ns = Ns Sy 8388608 Ns B Po 8 MiB Pc Pq u64
 Max write bytes per interval.
 .
 .It Sy l2arc_rebuild_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Rebuild the L2ARC when importing a pool (persistent L2ARC).
 This can be disabled if there are problems importing a pool
 or attaching an L2ARC device (e.g. the L2ARC device is slow
 in reading stored log metadata, or the metadata
 has become somehow fragmented/unusable).
 .
 .It Sy l2arc_rebuild_blocks_min_l2size Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64
 Minimum size of an L2ARC device required in order to write log blocks in it.
 The log blocks are used upon importing the pool to rebuild the persistent L2ARC.
 .Pp
 For L2ARC devices less than 1 GiB, the amount of data
 .Fn l2arc_evict
 evicts is significant compared to the amount of restored L2ARC data.
 In this case, do not write log blocks in L2ARC in order not to waste space.
 .
 .It Sy metaslab_aliquot Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
 Metaslab granularity, in bytes.
 This is roughly similar to what would be referred to as the "stripe size"
 in traditional RAID arrays.
 In normal operation, ZFS will try to write this amount of data to each disk
 before moving on to the next top-level vdev.
 .
 .It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable metaslab group biasing based on their vdevs' over- or under-utilization
 relative to the pool.
 .
 .It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16 MiB + 1 B Pc Pq u64
 Make some blocks above a certain size be gang blocks.
 This option is used by the test suite to facilitate testing.
 .
 .It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint
 For blocks that could be forced to be a gang block (due to
 .Sy metaslab_force_ganging ) ,
 force this many of them to be gang blocks.
 .
 .It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Controls prefetching BRT records for blocks which are going to be cloned.
 .
 .It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int
 Default BRT ZAP data block size as a power of 2. Note that changing this after
 creating a BRT on the pool will not affect existing BRTs, only newly created
 ones.
 .
 .It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int
 Default BRT ZAP indirect block size as a power of 2. Note that changing this
 after creating a BRT on the pool will not affect existing BRTs, only newly
 created ones.
 .
 .It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
 Default DDT ZAP data block size as a power of 2. Note that changing this after
 creating a DDT on the pool will not affect existing DDTs, only newly created
 ones.
 .
 .It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
 Default DDT ZAP indirect block size as a power of 2. Note that changing this
 after creating a DDT on the pool will not affect existing DDTs, only newly
 created ones.
 .
 .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int
 Default dnode block size as a power of 2.
 .
 .It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int
 Default dnode indirect block size as a power of 2.
 .
 .It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
 When attempting to log an output nvlist of an ioctl in the on-disk history,
 the output will not be stored if it is larger than this size (in bytes).
 This must be less than
 .Sy DMU_MAX_ACCESS Pq 64 MiB .
 This applies primarily to
 .Fn zfs_ioc_channel_program Pq cf. Xr zfs-program 8 .
 .
 .It Sy zfs_keep_log_spacemaps_at_export Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Prevent log spacemaps from being destroyed during pool exports and destroys.
 .
 .It Sy zfs_metaslab_segment_weight_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable/disable segment-based metaslab selection.
 .
 .It Sy zfs_metaslab_switch_threshold Ns = Ns Sy 2 Pq int
 When using segment-based metaslab selection, continue allocating
 from the active metaslab until this option's
 worth of buckets have been exhausted.
 .
 .It Sy metaslab_debug_load Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Load all metaslabs during pool import.
 .
 .It Sy metaslab_debug_unload Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Prevent metaslabs from being unloaded.
 .
 .It Sy metaslab_fragmentation_factor_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable use of the fragmentation metric in computing metaslab weights.
 .
 .It Sy metaslab_df_max_search Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
 Maximum distance to search forward from the last offset.
 Without this limit, fragmented pools can see
 .Em >100`000
 iterations and
 .Fn metaslab_block_picker
 becomes the performance limiting factor on high-performance storage.
 .Pp
 With the default setting of
 .Sy 16 MiB ,
 we typically see less than
 .Em 500
 iterations, even with very fragmented
 .Sy ashift Ns = Ns Sy 9
 pools.
 The maximum number of iterations possible is
 .Sy metaslab_df_max_search / 2^(ashift+1) .
 With the default setting of
 .Sy 16 MiB
 this is
 .Em 16*1024 Pq with Sy ashift Ns = Ns Sy 9
 or
 .Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 .
 .
 .It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int
 If not searching forward (due to
 .Sy metaslab_df_max_search , metaslab_df_free_pct ,
 .No or Sy metaslab_df_alloc_threshold ) ,
 this tunable controls which segment is used.
 If set, we will use the largest free segment.
 If unset, we will use a segment of at least the requested size.
 .
 .It Sy zfs_metaslab_max_size_cache_sec Ns = Ns Sy 3600 Ns s Po 1 hour Pc Pq u64
 When we unload a metaslab, we cache the size of the largest free chunk.
 We use that cached size to determine whether or not to load a metaslab
 for a given allocation.
 As more frees accumulate in that metaslab while it's unloaded,
 the cached max size becomes less and less accurate.
 After a number of seconds controlled by this tunable,
 we stop considering the cached max size and start
 considering only the histogram instead.
 .
 .It Sy zfs_metaslab_mem_limit Ns = Ns Sy 25 Ns % Pq uint
 When we are loading a new metaslab, we check the amount of memory being used
 to store metaslab range trees.
 If it is over a threshold, we attempt to unload the least recently used metaslab
 to prevent the system from clogging all of its memory with range trees.
 This tunable sets the percentage of total system memory that is the threshold.
 .
 .It Sy zfs_metaslab_try_hard_before_gang Ns = Ns Sy 0 Ns | Ns 1 Pq int
 .Bl -item -compact
 .It
 If unset, we will first try normal allocation.
 .It
 If that fails then we will do a gang allocation.
 .It
 If that fails then we will do a "try hard" gang allocation.
 .It
 If that fails then we will have a multi-layer gang block.
 .El
 .Pp
 .Bl -item -compact
 .It
 If set, we will first try normal allocation.
 .It
 If that fails then we will do a "try hard" allocation.
 .It
 If that fails we will do a gang allocation.
 .It
 If that fails we will do a "try hard" gang allocation.
 .It
 If that fails then we will have a multi-layer gang block.
 .El
 .
 .It Sy zfs_metaslab_find_max_tries Ns = Ns Sy 100 Pq uint
 When not trying hard, we only consider this number of the best metaslabs.
 This improves performance, especially when there are many metaslabs per vdev
 and the allocation can't actually be satisfied
 (so we would otherwise iterate all metaslabs).
 .
 .It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq uint
 When a vdev is added, target this number of metaslabs per top-level vdev.
 .
 .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq uint
 Default lower limit for metaslab size.
 .
 .It Sy zfs_vdev_max_ms_shift Ns = Ns Sy 34 Po 16 GiB Pc Pq uint
 Default upper limit for metaslab size.
 .
 .It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq uint
 Maximum ashift used when optimizing for logical \[->] physical sector size on
 new
 top-level vdevs.
 May be increased up to
 .Sy ASHIFT_MAX Po 16 Pc ,
 but this may negatively impact pool space efficiency.
 .
 .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq uint
 Minimum ashift used when creating new top-level vdevs.
 .
 .It Sy zfs_vdev_min_ms_count Ns = Ns Sy 16 Pq uint
 Minimum number of metaslabs to create in a top-level vdev.
 .
 .It Sy vdev_validate_skip Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Skip label validation steps during pool import.
 Changing is not recommended unless you know what you're doing
 and are recovering a damaged label.
 .
 .It Sy zfs_vdev_ms_count_limit Ns = Ns Sy 131072 Po 128k Pc Pq uint
 Practical upper limit of total metaslabs per top-level vdev.
 .
 .It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable metaslab group preloading.
 .
 .It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint
 Maximum number of metaslabs per group to preload.
 .
 .It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint
 Percentage of CPUs to run a metaslab preload taskq.
 .
 .It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Give more weight to metaslabs with lower LBAs,
 assuming they have greater bandwidth,
 as is typically the case on a modern constant angular velocity disk drive.
 .
 .It Sy metaslab_unload_delay Ns = Ns Sy 32 Pq uint
 After a metaslab is used, we keep it loaded for this many TXGs, to attempt to
 reduce unnecessary reloading.
 Note that both this many TXGs and
 .Sy metaslab_unload_delay_ms
 milliseconds must pass before unloading will occur.
 .
 .It Sy metaslab_unload_delay_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq uint
 After a metaslab is used, we keep it loaded for this many milliseconds,
 to attempt to reduce unnecessary reloading.
 Note that both this many milliseconds and
 .Sy metaslab_unload_delay
 TXGs must pass before unloading will occur.
 .
 .It Sy reference_history Ns = Ns Sy 3 Pq uint
 Maximum reference holders being tracked when reference_tracking_enable is
 active.
 .
 .It Sy reference_tracking_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Track reference holders to
 .Sy refcount_t
 objects (debug builds only).
 .
 .It Sy send_holes_without_birth_time Ns = Ns Sy 1 Ns | Ns 0 Pq int
 When set, the
 .Sy hole_birth
 optimization will not be used, and all holes will always be sent during a
 .Nm zfs Cm send .
 This is useful if you suspect your datasets are affected by a bug in
 .Sy hole_birth .
 .
 .It Sy spa_config_path Ns = Ns Pa /etc/zfs/zpool.cache Pq charp
 SPA config file.
 .
 .It Sy spa_asize_inflation Ns = Ns Sy 24 Pq uint
 Multiplication factor used to estimate actual disk consumption from the
 size of data being written.
 The default value is a worst case estimate,
 but lower values may be valid for a given pool depending on its configuration.
 Pool administrators who understand the factors involved
 may wish to specify a more realistic inflation factor,
 particularly if they operate close to quota or capacity limits.
 .
 .It Sy spa_load_print_vdev_tree Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Whether to print the vdev tree in the debugging message buffer during pool
 import.
 .
 .It Sy spa_load_verify_data Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Whether to traverse data blocks during an "extreme rewind"
 .Pq Fl X
 import.
 .Pp
 An extreme rewind import normally performs a full traversal of all
 blocks in the pool for verification.
 If this parameter is unset, the traversal skips non-metadata blocks.
 It can be toggled once the
 import has started to stop or start the traversal of non-metadata blocks.
 .
 .It Sy spa_load_verify_metadata  Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Whether to traverse blocks during an "extreme rewind"
 .Pq Fl X
 pool import.
 .Pp
 An extreme rewind import normally performs a full traversal of all
 blocks in the pool for verification.
 If this parameter is unset, the traversal is not performed.
 It can be toggled once the import has started to stop or start the traversal.
 .
 .It Sy spa_load_verify_shift Ns = Ns Sy 4 Po 1/16th Pc Pq uint
 Sets the maximum number of bytes to consume during pool import to the log2
 fraction of the target ARC size.
 .
 .It Sy spa_slop_shift Ns = Ns Sy 5 Po 1/32nd Pc Pq int
 Normally, we don't allow the last
 .Sy 3.2% Pq Sy 1/2^spa_slop_shift
 of space in the pool to be consumed.
 This ensures that we don't run the pool completely out of space,
 due to unaccounted changes (e.g. to the MOS).
 It also limits the worst-case time to allocate space.
 If we have less than this amount of free space,
 most ZPL operations (e.g. write, create) will return
 .Sy ENOSPC .
 .
 .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint
 Limits the number of on-disk error log entries that will be converted to the
 new format when enabling the
 .Sy head_errlog
 feature.
 The default is to convert all log entries.
 .
 .It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
 During top-level vdev removal, chunks of data are copied from the vdev
 which may include free space in order to trade bandwidth for IOPS.
 This parameter determines the maximum span of free space, in bytes,
 which will be included as "unnecessary" data in a chunk of copied data.
 .Pp
 The default value here was chosen to align with
 .Sy zfs_vdev_read_gap_limit ,
 which is a similar concept when doing
 regular reads (but there's no reason it has to be the same).
 .
 .It Sy vdev_file_logical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64
 Logical ashift for file-based devices.
 .
 .It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64
 Physical ashift for file-based devices.
 .
 .It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int
 If set, when we start iterating over a ZAP object,
 prefetch the entire object (all leaf blocks).
 However, this is limited by
 .Sy dmu_prefetch_max .
 .
 .It Sy zap_micro_max_size Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int
 Maximum micro ZAP size.
 A micro ZAP is upgraded to a fat ZAP, once it grows beyond the specified size.
 .
 .It Sy zfetch_hole_shift Ns = Ns Sy 2 Pq uint
 Log2 fraction of holes in speculative prefetch stream allowed for it to
 proceed.
 .
 .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
 Min bytes to prefetch per stream.
 Prefetch distance starts from the demand access size and quickly grows to
 this value, doubling on each hit.
 After that it may grow further by 1/8 per hit, but only if some prefetches
 since the last time haven't completed in time to satisfy a demand request,
 i.e. prefetch depth didn't cover the read latency or the pool got saturated.
 .
 .It Sy zfetch_max_distance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
 Max bytes to prefetch per stream.
 .
 .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint
 Max bytes to prefetch indirects for per stream.
 .
 .It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
 Requests within this byte distance from the current prefetch stream position
 are considered parts of the stream, reordered due to parallel processing.
 Such requests do not advance the stream position immediately unless
 .Sy zfetch_hole_shift
 fill threshold is reached, but saved to fill holes in the stream later.
 .
 .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint
 Max number of streams per zfetch (prefetch streams per file).
 .
 .It Sy zfetch_min_sec_reap Ns = Ns Sy 1 Pq uint
 Min time before inactive prefetch stream can be reclaimed.
 .
 .It Sy zfetch_max_sec_reap Ns = Ns Sy 2 Pq uint
 Max time before inactive prefetch stream can be deleted.
 .
 .It Sy zfs_abd_scatter_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enables ARC to use scatter/gather lists.
 When disabled, all allocations are forced to be linear in kernel memory.
 Disabling can improve performance in some code paths
 at the expense of fragmented kernel memory.
 .
 .It Sy zfs_abd_scatter_max_order Ns = Ns Sy MAX_ORDER\-1 Pq uint
 Maximum number of consecutive memory pages allocated in a single block for
 scatter/gather lists.
 .Pp
 The value of
 .Sy MAX_ORDER
 depends on kernel configuration.
 .
 .It Sy zfs_abd_scatter_min_size Ns = Ns Sy 1536 Ns B Po 1.5 KiB Pc Pq uint
 This is the minimum allocation size that will use scatter (page-based) ABDs.
 Smaller allocations will use linear ABDs.
 .
 .It Sy zfs_arc_dnode_limit Ns = Ns Sy 0 Ns B Pq u64
 When the number of bytes consumed by dnodes in the ARC exceeds this number of
 bytes, try to unpin some of it in response to demand for non-metadata.
 This value acts as a ceiling to the amount of dnode metadata, and defaults to
 .Sy 0 ,
 which indicates that the limit is instead derived from
 .Sy zfs_arc_dnode_limit_percent ,
 a percentage of the ARC meta buffers that may be used for dnodes.
 .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64
 Percentage that can be consumed by dnodes of ARC meta buffers.
 .Pp
 See also
 .Sy zfs_arc_dnode_limit ,
 which serves a similar purpose but has a higher priority if nonzero.
 .
 .It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64
 Percentage of ARC dnodes to try to scan in response to demand for non-metadata
 when the number of bytes consumed by dnodes exceeds
 .Sy zfs_arc_dnode_limit .
 .
 .It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint
 The ARC's buffer hash table is sized based on the assumption of an average
 block size of this value.
 This works out to roughly 1 MiB of hash table per 1 GiB of physical memory
 with 8-byte pointers.
 For configurations with a known larger average block size,
 this value can be increased to reduce the memory footprint.
 .
 .It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq uint
 When
 .Fn arc_is_overflowing ,
 .Fn arc_get_data_impl
 waits for this percent of the requested amount of data to be evicted.
 For example, by default, for every
 .Em 2 KiB
 that's evicted,
 .Em 1 KiB
 of it may be "reused" by a new allocation.
 Since this is above
 .Sy 100 Ns % ,
 it ensures that progress is made towards getting
 .Sy arc_size No under Sy arc_c .
 Since this is finite, it ensures that allocations can still happen,
 even during the potentially long time that
 .Sy arc_size No is more than Sy arc_c .
 .
 .It Sy zfs_arc_evict_batch_limit Ns = Ns Sy 10 Pq uint
 Number of ARC headers to evict per sub-list before proceeding to another sub-list.
 This batch-style operation prevents entire sub-lists from being evicted at once
 but comes at a cost of additional unlocking and locking.
 .
 .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint
 If set to a non zero value, it will replace the
 .Sy arc_grow_retry
 value with this value.
 The
 .Sy arc_grow_retry
 .No value Pq default Sy 5 Ns s
 is the number of seconds the ARC will wait before
 trying to resume growth after a memory pressure event.
 .
 .It Sy zfs_arc_lotsfree_percent Ns = Ns Sy 10 Ns % Pq int
 Throttle I/O when free system memory drops below this percentage of total
 system memory.
 Setting this value to
 .Sy 0
 will disable the throttle.
 .
 .It Sy zfs_arc_max Ns = Ns Sy 0 Ns B Pq u64
 Max size of ARC in bytes.
 If
 .Sy 0 ,
 then the max size of ARC is determined by the amount of system memory installed.
 Under Linux, half of system memory will be used as the limit.
 Under
 .Fx ,
 the larger of
 .Sy all_system_memory No \- Sy 1 GiB
 and
 .Sy 5/8 No \(mu Sy all_system_memory
 will be used as the limit.
 This value must be at least
 .Sy 67108864 Ns B Pq 64 MiB .
 .Pp
 This value can be changed dynamically, with some caveats.
 It cannot be set back to
 .Sy 0
 while running, and reducing it below the current ARC size will not cause
 the ARC to shrink without memory pressure to induce shrinking.
 .
 .It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint
 Balance between metadata and data on ghost hits.
 Values above 100 increase metadata caching by proportionally reducing effect
 of ghost data hits on target data/metadata rate.
 .
 .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64
 Min size of ARC in bytes.
 .No If set to Sy 0 , arc_c_min
 will default to consuming the larger of
 .Sy 32 MiB
 and
 .Sy all_system_memory No / Sy 32 .
 .
 .It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq uint
 Minimum time prefetched blocks are locked in the ARC.
 .
 .It Sy zfs_arc_min_prescient_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 6s Pc Pq uint
 Minimum time "prescient prefetched" blocks are locked in the ARC.
 These blocks are meant to be prefetched fairly aggressively ahead of
 the code that may use them.
 .
 .It Sy zfs_arc_prune_task_threads Ns = Ns Sy 1 Pq int
 Number of arc_prune threads.
 .Fx
 does not need more than one.
 Linux may theoretically use one per mount point up to number of CPUs,
 but that was not proven to be useful.
 .
 .It Sy zfs_max_missing_tvds Ns = Ns Sy 0 Pq int
 Number of missing top-level vdevs which will be allowed during
 pool import (only in read-only mode).
 .
 .It Sy zfs_max_nvlist_src_size Ns = Ns Sy 0 Pq u64
 Maximum size in bytes allowed to be passed as
 .Sy zc_nvlist_src_size
 for ioctls on
 .Pa /dev/zfs .
 This prevents a user from causing the kernel to allocate
 an excessive amount of memory.
 When the limit is exceeded, the ioctl fails with
 .Sy EINVAL
 and a description of the error is sent to the
 .Pa zfs-dbgmsg
 log.
 This parameter should not need to be touched under normal circumstances.
 If
 .Sy 0 ,
 equivalent to a quarter of the user-wired memory limit under
 .Fx
 and to
 .Sy 134217728 Ns B Pq 128 MiB
 under Linux.
 .
 .It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq uint
 To allow more fine-grained locking, each ARC state contains a series
 of lists for both data and metadata objects.
 Locking is performed at the level of these "sub-lists".
 This parameter controls the number of sub-lists per ARC state,
 and also applies to other uses of the multilist data structure.
 .Pp
 If
 .Sy 0 ,
 equivalent to the greater of the number of online CPUs and
 .Sy 4 .
 .
 .It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int
 The ARC size is considered to be overflowing if it exceeds the current
 ARC target size
 .Pq Sy arc_c
 by thresholds determined by this parameter.
 Exceeding by
 .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No / Sy 2
 starts the ARC reclamation process.
 If that appears insufficient, exceeding by
 .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No \(mu Sy 1.5
 blocks new buffer allocation until the reclaim thread catches up.
 Once started, the reclamation process continues until the ARC size returns
 below the target size.
 .Pp
 The default value of
 .Sy 8
 causes the ARC to start reclamation if it exceeds the target size by
 .Em 0.2%
 of the target size, and block allocations by
 .Em 0.6% .
 .
 .It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint
 If nonzero, this will update
 .Sy arc_shrink_shift Pq default Sy 7
 with the new value.
 .
 .It Sy zfs_arc_pc_percent Ns = Ns Sy 0 Ns % Po off Pc Pq uint
 Percent of pagecache to reclaim ARC to.
 .Pp
 This tunable allows the ZFS ARC to play more nicely
 with the kernel's LRU pagecache.
 It can guarantee that the ARC size won't collapse under scanning
 pressure on the pagecache, yet still allows the ARC to be reclaimed down to
 .Sy zfs_arc_min
 if necessary.
 This value is specified as percent of pagecache size (as measured by
 .Sy NR_FILE_PAGES ) ,
 where that percent may exceed
 .Sy 100 .
 This
 only operates during memory pressure/reclaim.
 .
 .It Sy zfs_arc_shrinker_limit Ns = Ns Sy 10000 Pq int
 This is a limit on how many pages the ARC shrinker makes available for
 eviction in response to one page allocation attempt.
 Note that in practice, the kernel's shrinker can ask us to evict
 up to about four times this for one allocation attempt.
 .Pp
 The default limit of
 .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages
 limits the amount of time spent attempting to reclaim ARC memory to
 less than 100 ms per allocation attempt,
 even with a small average compressed block size of ~8 KiB.
 .Pp
 The parameter can be set to 0 (zero) to disable the limit,
 and only applies on Linux.
 .
 .It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64
 The target number of bytes the ARC should leave as free memory on the system.
 If zero, equivalent to the bigger of
 .Sy 512 KiB No and Sy all_system_memory/64 .
 .
 .It Sy zfs_autoimport_disable Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Disable pool import at module load by ignoring the cache file
 .Pq Sy spa_config_path .
 .
 .It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint
 Rate limit checksum events to this many per second.
 Note that this should not be set below the ZED thresholds
 (currently 10 checksums over 10 seconds)
 or else the daemon may not trigger any action.
 .
 .It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint
 This controls the amount of time that a ZIL block (lwb) will remain "open"
 when it isn't "full", and it has a thread waiting for it to be committed to
 stable storage.
 The timeout is scaled based on a percentage of the last lwb
 latency to avoid significantly impacting the latency of each individual
 transaction record (itx).
 .
 .It Sy zfs_condense_indirect_commit_entry_delay_ms Ns = Ns Sy 0 Ns ms Pq int
 Vdev indirection layer (used for device removal) sleeps for this many
 milliseconds during mapping generation.
 Intended for use with the test suite to throttle vdev removal speed.
 .
 .It Sy zfs_condense_indirect_obsolete_pct Ns = Ns Sy 25 Ns % Pq uint
 Minimum percent of obsolete bytes in vdev mapping required to attempt to
 condense
 .Pq see Sy zfs_condense_indirect_vdevs_enable .
 Intended for use with the test suite
 to facilitate triggering condensing as needed.
 .
 .It Sy zfs_condense_indirect_vdevs_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable condensing indirect vdev mappings.
 When set, attempt to condense indirect vdev mappings
 if the mapping uses more than
 .Sy zfs_condense_min_mapping_bytes
 bytes of memory and if the obsolete space map object uses more than
 .Sy zfs_condense_max_obsolete_bytes
 bytes on-disk.
 The condensing process is an attempt to save memory by removing obsolete
 mappings.
 .
 .It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64
 Only attempt to condense indirect vdev mappings if the on-disk size
 of the obsolete space map object is greater than this number of bytes
 .Pq see Sy zfs_condense_indirect_vdevs_enable .
 .
 .It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64
 Minimum size vdev mapping to attempt to condense
 .Pq see Sy zfs_condense_indirect_vdevs_enable .
 .
 .It Sy zfs_dbgmsg_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Internally ZFS keeps a small log to facilitate debugging.
 The log is enabled by default, and can be disabled by unsetting this option.
 The contents of the log can be accessed by reading
 .Pa /proc/spl/kstat/zfs/dbgmsg .
 Writing
 .Sy 0
 to the file clears the log.
 .Pp
 This setting does not influence debug prints due to
 .Sy zfs_flags .
 .
 .It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint
 Maximum size of the internal ZFS debug log.
 .
 .It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int
 Historically used for controlling what reporting was available under
 .Pa /proc/spl/kstat/zfs .
 No effect.
 .
 .It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 When a pool sync operation takes longer than
 .Sy zfs_deadman_synctime_ms ,
 or when an individual I/O operation takes longer than
 .Sy zfs_deadman_ziotime_ms ,
 then the operation is considered to be "hung".
 If
 .Sy zfs_deadman_enabled
 is set, then the deadman behavior is invoked as described by
 .Sy zfs_deadman_failmode .
 By default, the deadman is enabled and set to
 .Sy wait
 which results in "hung" I/O operations only being logged.
 The deadman is automatically disabled when a pool gets suspended.
 .
 .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp
 Controls the failure behavior when the deadman detects a "hung" I/O operation.
 Valid values are:
 .Bl -tag -compact -offset 4n -width "continue"
 .It Sy wait
 Wait for a "hung" operation to complete.
 For each "hung" operation a "deadman" event will be posted
 describing that operation.
 .It Sy continue
 Attempt to recover from a "hung" operation by re-dispatching it
 to the I/O pipeline if possible.
 .It Sy panic
 Panic the system.
 This can be used to facilitate automatic fail-over
 to a properly configured fail-over partner.
 .El
 .
 .It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64
 Check time in milliseconds.
 This defines the frequency at which we check for hung I/O requests
 and potentially invoke the
 .Sy zfs_deadman_failmode
 behavior.
 .
 .It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64
 Interval in milliseconds after which the deadman is triggered and also
 the interval after which a pool sync operation is considered to be "hung".
 Once this limit is exceeded the deadman will be invoked every
 .Sy zfs_deadman_checktime_ms
 milliseconds until the pool sync completes.
 .
 .It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5 min Pc Pq u64
 Interval in milliseconds after which the deadman is triggered and an
 individual I/O operation is considered to be "hung".
 As long as the operation remains "hung",
 the deadman will be invoked every
 .Sy zfs_deadman_checktime_ms
 milliseconds until the operation completes.
 .
 .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Enable prefetching dedup-ed blocks which are going to be freed.
 .
 .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
 Start to delay each transaction once there is this amount of dirty data,
 expressed as a percentage of
 .Sy zfs_dirty_data_max .
 This value should be at least
 .Sy zfs_vdev_async_write_active_max_dirty_percent .
 .No See Sx ZFS TRANSACTION DELAY .
 .
 .It Sy zfs_delay_scale Ns = Ns Sy 500000 Pq int
 This controls how quickly the transaction delay approaches infinity.
 Larger values cause longer delays for a given amount of dirty data.
 .Pp
 For the smoothest delay, this value should be about 1 billion divided
 by the maximum number of operations per second.
 This will smoothly handle between ten times and a tenth of this number.
 .No See Sx ZFS TRANSACTION DELAY .
 .Pp
 .Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 .
 .
 .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disables requirement for IVset GUIDs to be present and match when doing a raw
 receive of encrypted datasets.
 Intended for users whose pools were created with
 OpenZFS pre-release versions and now have compatibility issues.
 .
 .It Sy zfs_key_max_salt_uses Ns = Ns Sy 400000000 Po 4*10^8 Pc Pq ulong
 Maximum number of uses of a single salt value before generating a new one for
 encrypted datasets.
 The default value is also the maximum.
 .
 .It Sy zfs_object_mutex_size Ns = Ns Sy 64 Pq uint
 Size of the znode hashtable used for holds.
 .Pp
 Due to the need to hold locks on objects that may not exist yet, kernel mutexes
 are not created per-object and instead a hashtable is used where collisions
 will result in objects waiting when there is not actually contention on the
 same object.
 .
 .It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int
 Rate limit delay and deadman zevents (which report slow I/O operations) to this
 many per
 second.
 .
 .It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64
 Upper-bound limit for unflushed metadata changes to be held by the
 log spacemap in memory, in bytes.
 .
 .It Sy zfs_unflushed_max_mem_ppm Ns = Ns Sy 1000 Ns ppm Po 0.1% Pc Pq u64
 Part of overall system memory that ZFS allows to be used
 for unflushed metadata changes by the log spacemap, in millionths.
 .
 .It Sy zfs_unflushed_log_block_max Ns = Ns Sy 131072 Po 128k Pc Pq u64
 Describes the maximum number of log spacemap blocks allowed for each pool.
 The default value means that the space in all the log spacemaps
 can add up to no more than
 .Sy 131072
 blocks (which means
 .Em 16 GiB
 of logical space before compression and ditto blocks,
 assuming that blocksize is
 .Em 128 KiB ) .
 .Pp
 This tunable is important because it involves a trade-off between import
 time after an unclean export and the frequency of flushing metaslabs.
 The higher this number is, the more log blocks we allow when the pool is
 active which means that we flush metaslabs less often and thus decrease
 the number of I/O operations for spacemap updates per TXG.
 At the same time though, that means that in the event of an unclean export,
 there will be more log spacemap blocks for us to read, inducing overhead
 in the import time of the pool.
 The lower the number, the more flushing occurs, destroying log
 blocks quicker as they become obsolete faster, which leaves fewer blocks
 to be read during import time after a crash.
 .Pp
 Each log spacemap block existing during pool import leads to approximately
 one extra logical I/O issued.
 This is the reason why this tunable is exposed in terms of blocks rather
 than space used.
 .
 .It Sy zfs_unflushed_log_block_min Ns = Ns Sy 1000 Pq u64
 If the number of metaslabs is small and our incoming rate is high,
 we could get into a situation in which we are flushing all our metaslabs every TXG.
 Thus we always allow at least this many log blocks.
 .
 .It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq u64
 Tunable used to determine the number of blocks that can be used for
 the spacemap log, expressed as a percentage of the total number of
 unflushed metaslabs in the pool.
 .
 .It Sy zfs_unflushed_log_txg_max Ns = Ns Sy 1000 Pq u64
 Tunable limiting maximum time in TXGs any metaslab may remain unflushed.
 It effectively limits maximum number of unflushed per-TXG spacemap logs
 that need to be read after unclean pool export.
 .
 .It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 When enabled, files will not be asynchronously removed from the list of pending
 unlinks and the space they consume will be leaked.
 Once this option has been disabled and the dataset is remounted,
 the pending unlinks will be processed and the freed space returned to the pool.
 This option is used by the test suite.
 .
 .It Sy zfs_delete_blocks Ns = Ns Sy 20480 Pq ulong
 This is used to define a large file for the purposes of deletion.
 Files containing more than
 .Sy zfs_delete_blocks
 will be deleted asynchronously, while smaller files are deleted synchronously.
 Decreasing this value will reduce the time spent in an
 .Xr unlink 2
 system call, at the expense of a longer delay before the freed space is
 available.
 This only applies on Linux.
 .
 .It Sy zfs_dirty_data_max Ns = Pq int
 Determines the dirty space limit in bytes.
 Once this limit is exceeded, new writes are halted until space frees up.
 This parameter takes precedence over
 .Sy zfs_dirty_data_max_percent .
 .No See Sx ZFS TRANSACTION DELAY .
 .Pp
 Defaults to
 .Sy physical_ram/10 ,
 capped at
 .Sy zfs_dirty_data_max_max .
 .
 .It Sy zfs_dirty_data_max_max Ns = Pq int
 Maximum allowable value of
 .Sy zfs_dirty_data_max ,
 expressed in bytes.
 This limit is only enforced at module load time, and will be ignored if
 .Sy zfs_dirty_data_max
 is later changed.
 This parameter takes precedence over
 .Sy zfs_dirty_data_max_max_percent .
 .No See Sx ZFS TRANSACTION DELAY .
 .Pp
 Defaults to
 .Sy min(physical_ram/4, 4GiB) ,
 or
 .Sy min(physical_ram/4, 1GiB)
 for 32-bit systems.
 .
 .It Sy zfs_dirty_data_max_max_percent Ns = Ns Sy 25 Ns % Pq uint
 Maximum allowable value of
 .Sy zfs_dirty_data_max ,
 expressed as a percentage of physical RAM.
 This limit is only enforced at module load time, and will be ignored if
 .Sy zfs_dirty_data_max
 is later changed.
 The parameter
 .Sy zfs_dirty_data_max_max
 takes precedence over this one.
 .No See Sx ZFS TRANSACTION DELAY .
 .
 .It Sy zfs_dirty_data_max_percent Ns = Ns Sy 10 Ns % Pq uint
 Determines the dirty space limit, expressed as a percentage of all memory.
 Once this limit is exceeded, new writes are halted until space frees up.
 The parameter
 .Sy zfs_dirty_data_max
 takes precedence over this one.
 .No See Sx ZFS TRANSACTION DELAY .
 .Pp
 Subject to
 .Sy zfs_dirty_data_max_max .
 .
 .It Sy zfs_dirty_data_sync_percent Ns = Ns Sy 20 Ns % Pq uint
 Start syncing out a transaction group if there's at least this much dirty data
 .Pq as a percentage of Sy zfs_dirty_data_max .
 This should be less than
 .Sy zfs_vdev_async_write_active_min_dirty_percent .
 .
 .It Sy zfs_wrlog_data_max Ns = Pq int
 The upper limit of write-transaction zil log data size in bytes.
 Write operations are throttled when approaching the limit until log data is
 cleared out after transaction group sync.
 Because of some overhead, it should be set to at least 2 times the size of
 .Sy zfs_dirty_data_max
 .No to prevent harming normal write throughput .
 It also should be smaller than the size of the slog device if slog is present.
 .Pp
 Defaults to
 .Sy zfs_dirty_data_max*2 .
 .
 .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint
 Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
 preallocated for a file in order to guarantee that later writes will not
 run out of space.
 Instead,
 .Xr fallocate 2
 space preallocation only checks that sufficient space is currently available
 in the pool or the user's project quota allocation,
 and then creates a sparse file of the requested size.
 The requested space is multiplied by
 .Sy zfs_fallocate_reserve_percent
 to allow additional space for indirect blocks and other internal metadata.
 Setting this to
 .Sy 0
 disables support for
 .Xr fallocate 2
 and causes it to return
 .Sy EOPNOTSUPP .
 .
 .It Sy zfs_fletcher_4_impl Ns = Ns Sy fastest Pq string
 Select a fletcher 4 implementation.
 .Pp
 Supported selectors are:
 .Sy fastest , scalar , sse2 , ssse3 , avx2 , avx512f , avx512bw ,
 .No and Sy aarch64_neon .
 All except
 .Sy fastest No and Sy scalar
 require instruction set extensions to be available,
 and will only appear if ZFS detects that they are present at runtime.
 If multiple implementations of fletcher 4 are available, the
 .Sy fastest
 will be chosen using a micro benchmark.
 Selecting
 .Sy scalar
 results in the original CPU-based calculation being used.
 Selecting any option other than
 .Sy fastest No or Sy scalar
 results in vector instructions
 from the respective CPU instruction set being used.
 .
 .It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable the experimental block cloning feature.
 If this setting is 0, then even if feature@block_cloning is enabled,
 attempts to clone blocks will act as though the feature is disabled.
 .
 .It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int
 When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be
 written to disk.
 This allows the clone operation to reliably succeed when a file is
 modified and then immediately cloned.
 For small files this may be slower than making a copy of the file.
 Therefore, this setting defaults to 0 which causes a clone operation to
 immediately fail when encountering a dirty block.
 .
 .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string
 Select a BLAKE3 implementation.
 .Pp
 Supported selectors are:
 .Sy cycle , fastest , generic , sse2 , sse41 , avx2 , avx512 .
 All except
 .Sy cycle , fastest No and Sy generic
 require instruction set extensions to be available,
 and will only appear if ZFS detects that they are present at runtime.
 If multiple implementations of BLAKE3 are available, the
 .Sy fastest
 will be chosen using a micro benchmark.
 You can see the benchmark results by reading this kstat file:
 .Pa /proc/spl/kstat/zfs/chksum_bench .
 .
 .It Sy zfs_free_bpobj_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable/disable the processing of the free_bpobj object.
 .
 .It Sy zfs_async_block_max_blocks Ns = Ns Sy UINT64_MAX Po unlimited Pc Pq u64
 Maximum number of blocks freed in a single TXG.
 .
 .It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq u64
 Maximum number of dedup blocks freed in a single TXG.
 .
 .It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq uint
 Maximum asynchronous read I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_async_read_min_active Ns = Ns Sy 1 Pq uint
 Minimum asynchronous read I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_async_write_active_max_dirty_percent Ns = Ns Sy 60 Ns % Pq uint
 When the pool has more than this much dirty data, use
 .Sy zfs_vdev_async_write_max_active
 to limit active async writes.
 If the dirty data is between the minimum and maximum,
 the active I/O limit is linearly interpolated.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_async_write_active_min_dirty_percent Ns = Ns Sy 30 Ns % Pq uint
 When the pool has less than this much dirty data, use
 .Sy zfs_vdev_async_write_min_active
 to limit active async writes.
 If the dirty data is between the minimum and maximum,
 the active I/O limit is linearly
 interpolated.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_async_write_max_active Ns = Ns Sy 10 Pq uint
 Maximum asynchronous write I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_async_write_min_active Ns = Ns Sy 2 Pq uint
 Minimum asynchronous write I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .Pp
 Lower values are associated with better latency on rotational media but poorer
 resilver performance.
 The default value of
 .Sy 2
 was chosen as a compromise.
 A value of
 .Sy 3
 has been shown to improve resilver performance further at a cost of
 further increasing latency.
 .
 .It Sy zfs_vdev_initializing_max_active Ns = Ns Sy 1 Pq uint
 Maximum initializing I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_initializing_min_active Ns = Ns Sy 1 Pq uint
 Minimum initializing I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_max_active Ns = Ns Sy 1000 Pq uint
 The maximum number of I/O operations active to each device.
 Ideally, this will be at least the sum of each queue's
 .Sy max_active .
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_open_timeout_ms Ns = Ns Sy 1000 Pq uint
 Timeout value to wait before determining a device is missing
 during import.
 This is helpful for transient missing paths due
 to links being briefly removed and recreated in response to
 udev events.
 .
 .It Sy zfs_vdev_rebuild_max_active Ns = Ns Sy 3 Pq uint
 Maximum sequential resilver I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_rebuild_min_active Ns = Ns Sy 1 Pq uint
 Minimum sequential resilver I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_removal_max_active Ns = Ns Sy 2 Pq uint
 Maximum removal I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_removal_min_active Ns = Ns Sy 1 Pq uint
 Minimum removal I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_scrub_max_active Ns = Ns Sy 2 Pq uint
 Maximum scrub I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_scrub_min_active Ns = Ns Sy 1 Pq uint
 Minimum scrub I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_sync_read_max_active Ns = Ns Sy 10 Pq uint
 Maximum synchronous read I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_sync_read_min_active Ns = Ns Sy 10 Pq uint
 Minimum synchronous read I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_sync_write_max_active Ns = Ns Sy 10 Pq uint
 Maximum synchronous write I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_sync_write_min_active Ns = Ns Sy 10 Pq uint
 Minimum synchronous write I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_trim_max_active Ns = Ns Sy 2 Pq uint
 Maximum trim/discard I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_trim_min_active Ns = Ns Sy 1 Pq uint
 Minimum trim/discard I/O operations active to each device.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_nia_delay Ns = Ns Sy 5 Pq uint
 For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
 the number of concurrently-active I/O operations is limited to
 .Sy zfs_*_min_active ,
 unless the vdev is "idle".
 When there are no interactive I/O operations active (synchronous or otherwise),
 and
 .Sy zfs_vdev_nia_delay
 operations have completed since the last interactive operation,
 then the vdev is considered to be "idle",
 and the number of concurrently-active non-interactive operations is increased to
 .Sy zfs_*_max_active .
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_nia_credit Ns = Ns Sy 5 Pq uint
 Some HDDs tend to prioritize sequential I/O so strongly, that concurrent
 random I/O latency reaches several seconds.
 On some HDDs this happens even if sequential I/O operations
 are submitted one at a time, and so setting
 .Sy zfs_*_max_active Ns = Ns Sy 1
 does not help.
 To prevent non-interactive I/O, like scrub,
 from monopolizing the device, no more than
 .Sy zfs_vdev_nia_credit
 operations can be sent
 while there are outstanding incomplete interactive operations.
 This enforced wait ensures the HDD services the interactive I/O
 within a reasonable amount of time.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_queue_depth_pct Ns = Ns Sy 1000 Ns % Pq uint
 Maximum number of queued allocations per top-level vdev expressed as
 a percentage of
 .Sy zfs_vdev_async_write_max_active ,
 which allows the system to detect devices that are more capable
 of handling allocations and to allocate more blocks to those devices.
 This allows for dynamic allocation distribution when devices are imbalanced,
 as fuller devices will tend to be slower than empty devices.
 .Pp
 Also see
 .Sy zio_dva_throttle_enabled .
 .
 .It Sy zfs_vdev_def_queue_depth Ns = Ns Sy 32 Pq uint
 Default queue depth for each vdev IO allocator.
 Higher values allow for better coalescing of sequential writes before sending
 them to the disk, but can increase transaction commit times.
 .
 .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint
 Defines if the driver should retry on a given error type.
 The following options may be bitwise-ored together:
 .TS
 box;
 lbz r l l .
 	Value	Name	Description
 _
 	1	Device	No driver retries on device errors.
 	2	Transport	No driver retries on transport errors.
 	4	Driver	No driver retries on driver errors.
 .TE
 .
 .It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint
 Maximum number of segments to add to a BIO (min 4).
 If this is higher than the maximum allowed by the device queue or the kernel
 itself, it will be clamped.
 Setting it to zero will cause the kernel's ideal size to be used.
 This parameter only applies on Linux.
 This parameter is ignored if
 .Sy zfs_vdev_disk_classic Ns = Ns Sy 1 .
 .
 .It Sy zfs_vdev_disk_classic Ns = Ns 0 Ns | Ns Sy 1 Pq uint
 Controls the method used to submit IO to the Linux block layer
 (default
 .Sy 1 "classic" Ns
 ).
 .Pp
 If set to 1, the "classic" method is used.
 This is the method that has been in use since the earliest versions of
 ZFS-on-Linux.
 It has known issues with highly fragmented IO requests and is less efficient on
 many workloads, but it is well known and well understood.
 .Pp
 If set to 0, the "new" method is used.
 This method is available since 2.2.4 and should resolve all known issues and be
 far more efficient, but has not had as much testing.
 In the 2.2.x series, this parameter defaults to 1, to use the "classic" method.
 .Pp
 It is not recommended that you change it except on advice from the OpenZFS
 developers.
 If you do change it, please also open a bug report describing why you did so,
 including the workload involved and any error messages.
 .Pp
 This parameter and the "classic" submission method will be removed in a future
 release of OpenZFS once we have total confidence in the new method.
 .Pp
 This parameter only applies on Linux, and can only be set at module load time.
 .
 .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int
 Time before expiring
 .Pa .zfs/snapshot .
 .
 .It Sy zfs_admin_snapshot Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Allow the creation, removal, or renaming of entries in the
 .Sy .zfs/snapshot
 directory to cause the creation, destruction, or renaming of snapshots.
 When enabled, this functionality works both locally and over NFS exports
 which have the
 .Em no_root_squash
 option set.
 .
 .It Sy zfs_flags Ns = Ns Sy 0 Pq int
 Set additional debugging flags.
 The following flags may be bitwise-ored together:
 .TS
 box;
 lbz r l l .
 	Value	Name	Description
 _
 	1	ZFS_DEBUG_DPRINTF	Enable dprintf entries in the debug log.
 *	2	ZFS_DEBUG_DBUF_VERIFY	Enable extra dbuf verifications.
 *	4	ZFS_DEBUG_DNODE_VERIFY	Enable extra dnode verifications.
 	8	ZFS_DEBUG_SNAPNAMES	Enable snapshot name verification.
 *	16	ZFS_DEBUG_MODIFY	Check for illegally modified ARC buffers.
 	64	ZFS_DEBUG_ZIO_FREE	Enable verification of block frees.
 	128	ZFS_DEBUG_HISTOGRAM_VERIFY	Enable extra spacemap histogram verifications.
 	256	ZFS_DEBUG_METASLAB_VERIFY	Verify space accounting on disk matches in-memory \fBrange_trees\fP.
 	512	ZFS_DEBUG_SET_ERROR	Enable \fBSET_ERROR\fP and dprintf entries in the debug log.
 	1024	ZFS_DEBUG_INDIRECT_REMAP	Verify split blocks created by device removal.
 	2048	ZFS_DEBUG_TRIM	Verify TRIM ranges are always within the allocatable range tree.
 	4096	ZFS_DEBUG_LOG_SPACEMAP	Verify that the log summary is consistent with the spacemap log
 			       and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing.
 .TE
 .Sy \& * No Requires debug build .
 .
 .It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint
 Enables btree verification.
 The following settings are cumulative:
 .TS
 box;
 lbz r l l .
 	Value	Description
 
 	1	Verify height.
 	2	Verify pointers from children to parent.
 	3	Verify element counts.
 	4	Verify element order. (expensive)
 *	5	Verify unused memory is poisoned. (expensive)
 .TE
 .Sy \& * No Requires debug build .
 .
 .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int
 If destroy encounters an
 .Sy EIO
 while reading metadata (e.g. indirect blocks),
 space referenced by the missing metadata can not be freed.
 Normally this causes the background destroy to become "stalled",
 as it is unable to make forward progress.
 While in this stalled state, all remaining space to free
 from the error-encountering filesystem is "temporarily leaked".
 Set this flag to cause it to ignore the
 .Sy EIO ,
 permanently leak the space from indirect blocks that can not be read,
 and continue to free everything else that it can.
 .Pp
 The default "stalling" behavior is useful if the storage partially
 fails (i.e. some but not all I/O operations fail), and then later recovers.
 In this case, we will be able to continue pool operations while it is
 partially failed, and when it recovers, we can continue to free the
 space, with no leaks.
 Note, however, that this case is actually fairly rare.
 .Pp
 Typically pools either
 .Bl -enum -compact -offset 4n -width "1."
 .It
 fail completely (but perhaps temporarily,
 e.g. due to a top-level vdev going offline), or
 .It
 have localized, permanent errors (e.g. disk returns the wrong data
 due to bit flip or firmware bug).
 .El
 In the former case, this setting does not matter because the
 pool will be suspended and the sync thread will not be able to make
 forward progress regardless.
 In the latter, because the error is permanent, the best we can do
 is leak the minimum amount of space,
 which is what setting this flag will do.
 It is therefore reasonable for this flag to normally be set,
 but we chose the more conservative approach of not setting it,
 so that there is no possibility of
 leaking space in the "partial temporary" failure case.
 .
 .It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq uint
 During a
 .Nm zfs Cm destroy
 operation using the
 .Sy async_destroy
 feature,
 a minimum of this much time will be spent working on freeing blocks per TXG.
 .
 .It Sy zfs_obsolete_min_time_ms Ns = Ns Sy 500 Ns ms Pq uint
 Similar to
 .Sy zfs_free_min_time_ms ,
 but for cleanup of old indirection records for removed vdevs.
 .
 .It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq s64
 Largest data block to write to the ZIL.
 Larger blocks will be treated as if the dataset being written to had the
 .Sy logbias Ns = Ns Sy throughput
 property set.
 .
 .It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64
 Pattern written to vdev free space by
 .Xr zpool-initialize 8 .
 .
 .It Sy zfs_initialize_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
 Size of writes used by
 .Xr zpool-initialize 8 .
 This option is used by the test suite.
 .
 .It Sy zfs_livelist_max_entries Ns = Ns Sy 500000 Po 5*10^5 Pc Pq u64
 The threshold size (in block pointers) at which we create a new sub-livelist.
 Larger sublists are more costly from a memory perspective but the fewer
 sublists there are, the lower the cost of insertion.
 .
 .It Sy zfs_livelist_min_percent_shared Ns = Ns Sy 75 Ns % Pq int
 If the amount of shared space between a snapshot and its clone drops below
 this threshold, the clone turns off the livelist and reverts to the old
 deletion method.
 This is in place because livelists no longer give us a benefit
 once a clone has been overwritten enough.
 .
 .It Sy zfs_livelist_condense_new_alloc Ns = Ns Sy 0 Pq int
 Incremented each time an extra ALLOC blkptr is added to a livelist entry while
 it is being condensed.
 This option is used by the test suite to track race conditions.
 .
 .It Sy zfs_livelist_condense_sync_cancel Ns = Ns Sy 0 Pq int
 Incremented each time livelist condensing is canceled while in
 .Fn spa_livelist_condense_sync .
 This option is used by the test suite to track race conditions.
 .
 .It Sy zfs_livelist_condense_sync_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int
 When set, the livelist condense process pauses indefinitely before
 executing the synctask \(em
 .Fn spa_livelist_condense_sync .
 This option is used by the test suite to trigger race conditions.
 .
 .It Sy zfs_livelist_condense_zthr_cancel Ns = Ns Sy 0 Pq int
 Incremented each time livelist condensing is canceled while in
 .Fn spa_livelist_condense_cb .
 This option is used by the test suite to track race conditions.
 .
 .It Sy zfs_livelist_condense_zthr_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int
 When set, the livelist condense process pauses indefinitely before
 executing the open context condensing work in
 .Fn spa_livelist_condense_cb .
 This option is used by the test suite to trigger race conditions.
 .
 .It Sy zfs_lua_max_instrlimit Ns = Ns Sy 100000000 Po 10^8 Pc Pq u64
 The maximum execution time limit that can be set for a ZFS channel program,
 specified as a number of Lua instructions.
 .
 .It Sy zfs_lua_max_memlimit Ns = Ns Sy 104857600 Po 100 MiB Pc Pq u64
 The maximum memory limit that can be set for a ZFS channel program, specified
 in bytes.
 .
 .It Sy zfs_max_dataset_nesting Ns = Ns Sy 50 Pq int
 The maximum depth of nested datasets.
 This value can be tuned temporarily to
 fix existing datasets that exceed the predefined limit.
 .
 .It Sy zfs_max_log_walking Ns = Ns Sy 5 Pq u64
 The number of past TXGs that the flushing algorithm of the log spacemap
 feature uses to estimate incoming log blocks.
 .
 .It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq u64
 Maximum number of rows allowed in the summary of the spacemap log.
 .
 .It Sy zfs_max_recordsize Ns = Ns Sy 16777216 Po 16 MiB Pc Pq uint
 We currently support block sizes from
 .Em 512 Po 512 B Pc No to Em 16777216 Po 16 MiB Pc .
 The benefits of larger blocks, and thus larger I/O,
 need to be weighed against the cost of COWing a giant block to modify one byte.
 Additionally, very large blocks can have an impact on I/O latency,
 and also potentially on the memory allocator.
 Therefore, we formerly forbade creating blocks larger than 1M.
 Larger blocks could be created by changing this tunable,
 and pools with larger blocks can always be imported and used,
 regardless of this setting.
 .
 .It Sy zfs_allow_redacted_dataset_mount Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Allow datasets received with redacted send/receive to be mounted.
 Normally disabled because these datasets may be missing key data.
 .
 .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64
 Minimum number of metaslabs to flush per dirty TXG.
 .
 .It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 70 Ns % Pq uint
 Allow metaslabs to keep their active state as long as their fragmentation
 percentage is no more than this value.
 An active metaslab that exceeds this threshold
 will no longer keep its active status allowing better metaslabs to be selected.
 .
 .It Sy zfs_mg_fragmentation_threshold Ns = Ns Sy 95 Ns % Pq uint
 Metaslab groups are considered eligible for allocations if their
 fragmentation metric (measured as a percentage) is less than or equal to
 this value.
 If a metaslab group exceeds this threshold then it will be
 skipped unless all metaslab groups within the metaslab class have also
 crossed this threshold.
 .
 .It Sy zfs_mg_noalloc_threshold Ns = Ns Sy 0 Ns % Pq uint
 Defines a threshold at which metaslab groups should be eligible for allocations.
 The value is expressed as a percentage of free space
 beyond which a metaslab group is always eligible for allocations.
 If a metaslab group's free space is less than or equal to the
 threshold, the allocator will avoid allocating to that group
 unless all groups in the pool have reached the threshold.
 Once all groups have reached the threshold, all groups are allowed to accept
 allocations.
 The default value of
 .Sy 0
 disables the feature and causes all metaslab groups to be eligible for
 allocations.
 .Pp
 This parameter allows one to deal with pools having heavily imbalanced
 vdevs such as would be the case when a new vdev has been added.
 Setting the threshold to a non-zero percentage will stop allocations
 from being made to vdevs that aren't filled to the specified percentage
 and allow lesser filled vdevs to acquire more allocations than they
 otherwise would under the old
 .Sy zfs_mg_alloc_failures
 facility.
 .
 .It Sy zfs_ddt_data_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int
 If enabled, ZFS will place DDT data into the special allocation class.
 .
 .It Sy zfs_user_indirect_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int
 If enabled, ZFS will place user data indirect blocks
 into the special allocation class.
 .
 .It Sy zfs_multihost_history Ns = Ns Sy 0 Pq uint
 Historical statistics for this many latest multihost updates will be available
 in
 .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /multihost .
 .
 .It Sy zfs_multihost_interval Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq u64
 Used to control the frequency of multihost writes which are performed when the
 .Sy multihost
 pool property is on.
 This is one of the factors used to determine the
 length of the activity check during import.
 .Pp
 The multihost write period is
 .Sy zfs_multihost_interval No / Sy leaf-vdevs .
 On average a multihost write will be issued for each leaf vdev
 every
 .Sy zfs_multihost_interval
 milliseconds.
 In practice, the observed period can vary with the I/O load
 and this observed value is the delay which is stored in the uberblock.
 .
 .It Sy zfs_multihost_import_intervals Ns = Ns Sy 20 Pq uint
 Used to control the duration of the activity test on import.
 Smaller values of
 .Sy zfs_multihost_import_intervals
 will reduce the import time but increase
 the risk of failing to detect an active pool.
 The total activity check time is never allowed to drop below one second.
 .Pp
 On import the activity check waits a minimum amount of time determined by
 .Sy zfs_multihost_interval No \(mu Sy zfs_multihost_import_intervals ,
 or the same product computed on the host which last had the pool imported,
 whichever is greater.
 The activity check time may be further extended if the value of MMP
 delay found in the best uberblock indicates actual multihost updates happened
 at longer intervals than
 .Sy zfs_multihost_interval .
 A minimum of
 .Em 100 ms
 is enforced.
 .Pp
 .Sy 0 No is equivalent to Sy 1 .
 .
 .It Sy zfs_multihost_fail_intervals Ns = Ns Sy 10 Pq uint
 Controls the behavior of the pool when multihost write failures or delays are
 detected.
 .Pp
 When
 .Sy 0 ,
 multihost write failures or delays are ignored.
 The failures will still be reported to the ZED which depending on
 its configuration may take action such as suspending the pool or offlining a
 device.
 .Pp
 Otherwise, the pool will be suspended if
 .Sy zfs_multihost_fail_intervals No \(mu Sy zfs_multihost_interval
 milliseconds pass without a successful MMP write.
 This guarantees the activity test will see MMP writes if the pool is imported.
 .Sy 1 No is equivalent to Sy 2 ;
 this is necessary to prevent the pool from being suspended
 due to normal, small I/O latency variations.
 .
 .It Sy zfs_no_scrub_io Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Set to disable scrub I/O.
 This results in scrubs not actually scrubbing data and
 simply doing a metadata crawl of the pool instead.
 .
 .It Sy zfs_no_scrub_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Set to disable block prefetching for scrubs.
 .
 .It Sy zfs_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable cache flush operations on disks when writing.
 Setting this will cause pool corruption on power loss
 if a volatile out-of-order write cache is enabled.
 .
 .It Sy zfs_nopwrite_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Allow no-operation writes.
 The occurrence of nopwrites will further depend on other pool properties
 .Pq i.a. the checksumming and compression algorithms .
 .
 .It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Enable forcing TXG sync to find holes.
 When enabled forces ZFS to sync data when
 .Sy SEEK_HOLE No or Sy SEEK_DATA
 flags are used allowing holes in a file to be accurately reported.
 When disabled holes will not be reported in recently dirtied files.
 .
 .It Sy zfs_pd_bytes_max Ns = Ns Sy 52428800 Ns B Po 50 MiB Pc Pq int
 The number of bytes which should be prefetched during a pool traversal, like
 .Nm zfs Cm send
 or other data crawling operations.
 .
 .It Sy zfs_traverse_indirect_prefetch_limit Ns = Ns Sy 32 Pq uint
 The number of blocks pointed to by an indirect (non-L0) block which should be
 prefetched during a pool traversal, like
 .Nm zfs Cm send
 or other data crawling operations.
 .
 .It Sy zfs_per_txg_dirty_frees_percent Ns = Ns Sy 30 Ns % Pq u64
 Control percentage of dirtied indirect blocks from frees allowed into one TXG.
 After this threshold is crossed, additional frees will wait until the next TXG.
 .Sy 0 No disables this throttle .
 .
 .It Sy zfs_prefetch_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable predictive prefetch.
 Note that it leaves "prescient" prefetch
 .Pq for, e.g., Nm zfs Cm send
 intact.
 Unlike predictive prefetch, prescient prefetch never issues I/O
 that ends up not being needed, so it can't hurt performance.
 .
 .It Sy zfs_qat_checksum_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable QAT hardware acceleration for SHA256 checksums.
 May be unset after the ZFS modules have been loaded to initialize the QAT
 hardware as long as support is compiled in and the QAT driver is present.
 .
 .It Sy zfs_qat_compress_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable QAT hardware acceleration for gzip compression.
 May be unset after the ZFS modules have been loaded to initialize the QAT
 hardware as long as support is compiled in and the QAT driver is present.
 .
 .It Sy zfs_qat_encrypt_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable QAT hardware acceleration for AES-GCM encryption.
 May be unset after the ZFS modules have been loaded to initialize the QAT
 hardware as long as support is compiled in and the QAT driver is present.
 .
 .It Sy zfs_vnops_read_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
 Bytes to read per chunk.
 .
 .It Sy zfs_read_history Ns = Ns Sy 0 Pq uint
 Historical statistics for this many latest reads will be available in
 .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /reads .
 .
 .It Sy zfs_read_history_hits Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Include cache hits in read history.
 .
 .It Sy zfs_rebuild_max_segment Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64
 Maximum read segment size to issue when sequentially resilvering a
 top-level vdev.
 .
 .It Sy zfs_rebuild_scrub_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Automatically start a pool scrub when the last active sequential resilver
 completes in order to verify the checksums of all blocks which have been
 resilvered.
 This is enabled by default and strongly recommended.
 .
 .It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64
 Maximum amount of I/O that can be concurrently issued for a sequential
 resilver per leaf device, given in bytes.
 .
 .It Sy zfs_reconstruct_indirect_combinations_max Ns = Ns Sy 4096 Pq int
 If an indirect split block contains more than this many possible unique
 combinations when being reconstructed, consider it too computationally
 expensive to check them all.
 Instead, try at most this many randomly selected
 combinations each time the block is accessed.
 This allows all segment copies to participate fairly
 in the reconstruction when all combinations
 cannot be checked and prevents repeated use of one bad copy.
 .
 .It Sy zfs_recover Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Set to attempt to recover from fatal errors.
 This should only be used as a last resort,
 as it typically results in leaked space, or worse.
 .
 .It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Ignore hard I/O errors during device removal.
 When set, if a device encounters a hard I/O error during the removal process
 the removal will not be cancelled.
 This can result in a normally recoverable block becoming permanently damaged
 and is hence not recommended.
 This should only be used as a last resort when the
 pool cannot be returned to a healthy state prior to removing the device.
 .
 .It Sy zfs_removal_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 This is used by the test suite so that it can ensure that certain actions
 happen while in the middle of a removal.
 .
 .It Sy zfs_remove_max_segment Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
 The largest contiguous segment that we will attempt to allocate when removing
 a device.
 If there is a performance problem with attempting to allocate large blocks,
 consider decreasing this.
 The default value is also the maximum.
 .
 .It Sy zfs_resilver_disable_defer Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Ignore the
 .Sy resilver_defer
 feature, causing an operation that would start a resilver to
 immediately restart the one in progress.
 .
 .It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3 s Pc Pq uint
 Resilvers are processed by the sync thread.
 While resilvering, it will spend at least this much time
 working on a resilver between TXG flushes.
 .
 .It Sy zfs_scan_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int
 If set, remove the DTL (dirty time list) upon completion of a pool scan (scrub),
 even if there were unrepairable errors.
 Intended to be used during pool repair or recovery to
 stop resilvering when the pool is next imported.
 .
 .It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq uint
 Scrubs are processed by the sync thread.
 While scrubbing, it will spend at least this much time
 working on a scrub between TXG flushes.
 .
 .It Sy zfs_scrub_error_blocks_per_txg Ns = Ns Sy 4096 Pq uint
 Error blocks to be scrubbed in one txg.
 .
 .It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq uint
 To preserve progress across reboots, the sequential scan algorithm periodically
 needs to stop metadata scanning and issue all the verification I/O to disk.
 The frequency of this flushing is determined by this tunable.
 .
 .It Sy zfs_scan_fill_weight Ns = Ns Sy 3 Pq uint
 This tunable affects how scrub and resilver I/O segments are ordered.
 A higher number indicates that we care more about how filled in a segment is,
 while a lower number indicates we care more about the size of the extent without
 considering the gaps within a segment.
 This value is only tunable upon module insertion.
 Changing the value afterwards will have no effect on scrub or resilver
 performance.
 .
 .It Sy zfs_scan_issue_strategy Ns = Ns Sy 0 Pq uint
 Determines the order that data will be verified while scrubbing or resilvering:
 .Bl -tag -compact -offset 4n -width "a"
 .It Sy 1
 Data will be verified as sequentially as possible, given the
 amount of memory reserved for scrubbing
 .Pq see Sy zfs_scan_mem_lim_fact .
 This may improve scrub performance if the pool's data is very fragmented.
 .It Sy 2
 The largest mostly-contiguous chunk of found data will be verified first.
 By deferring scrubbing of small segments, we may later find adjacent data
 to coalesce and increase the segment size.
 .It Sy 0
 .No Use strategy Sy 1 No during normal verification
 .No and strategy Sy 2 No while taking a checkpoint .
 .El
 .
 .It Sy zfs_scan_legacy Ns = Ns Sy 0 Ns | Ns 1 Pq int
 If unset, indicates that scrubs and resilvers will gather metadata in
 memory before issuing sequential I/O.
 Otherwise indicates that the legacy algorithm will be used,
 where I/O is initiated as soon as it is discovered.
 Unsetting will not affect scrubs or resilvers that are already in progress.
 .
 .It Sy zfs_scan_max_ext_gap Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq int
 Sets the largest gap in bytes between scrub/resilver I/O operations
 that will still be considered sequential for sorting purposes.
 Changing this value will not
 affect scrubs or resilvers that are already in progress.
 .
 .It Sy zfs_scan_mem_lim_fact Ns = Ns Sy 20 Ns ^-1 Pq uint
 Maximum fraction of RAM used for I/O sorting by sequential scan algorithm.
 This tunable determines the hard limit for I/O sorting memory usage.
 When the hard limit is reached we stop scanning metadata and start issuing
 data verification I/O.
 This is done until we get below the soft limit.
 .
 .It Sy zfs_scan_mem_lim_soft_fact Ns = Ns Sy 20 Ns ^-1 Pq uint
 The fraction of the hard limit used to determine the soft limit for I/O sorting
 by the sequential scan algorithm.
 When we cross this limit from below no action is taken.
 When we cross this limit from above it is because we are issuing verification
 I/O.
 In this case (unless the metadata scan is done) we stop issuing verification I/O
 and start scanning metadata again until we get to the hard limit.
 .
 .It Sy zfs_scan_report_txgs Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 When reporting resilver throughput and estimated completion time use the
 performance observed over roughly the last
 .Sy zfs_scan_report_txgs
 TXGs.
 When set to zero performance is calculated over the time between checkpoints.
 .
 .It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Enforce tight memory limits on pool scans when a sequential scan is in progress.
 When disabled, the memory limit may be exceeded by fast disks.
 .
 .It Sy zfs_scan_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Freezes a scrub/resilver in progress without actually pausing it.
 Intended for testing/debugging.
 .
 .It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int
 Maximum amount of data that can be concurrently issued at once for scrubs and
 resilvers per leaf device, given in bytes.
 .
 .It Sy zfs_send_corrupt_data Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Allow sending of corrupt data (ignore read/checksum errors when sending).
 .
 .It Sy zfs_send_unmodified_spill_blocks Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Include unmodified spill blocks in the send stream.
 Under certain circumstances, previous versions of ZFS could incorrectly
 remove the spill block from an existing object.
 Including unmodified copies of the spill blocks creates a backwards-compatible
 stream which will recreate a spill block if it was incorrectly removed.
 .
 .It Sy zfs_send_no_prefetch_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint
 The fill fraction of the
 .Nm zfs Cm send
 internal queues.
 The fill fraction controls the timing with which internal threads are woken up.
 .
 .It Sy zfs_send_no_prefetch_queue_length Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
 The maximum number of bytes allowed in
 .Nm zfs Cm send Ns 's
 internal queues.
 .
 .It Sy zfs_send_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint
 The fill fraction of the
 .Nm zfs Cm send
 prefetch queue.
 The fill fraction controls the timing with which internal threads are woken up.
 .
 .It Sy zfs_send_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
 The maximum number of bytes allowed that will be prefetched by
 .Nm zfs Cm send .
 This value must be at least twice the maximum block size in use.
 .
 .It Sy zfs_recv_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint
 The fill fraction of the
 .Nm zfs Cm receive
 queue.
 The fill fraction controls the timing with which internal threads are woken up.
 .
 .It Sy zfs_recv_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint
 The maximum number of bytes allowed in the
 .Nm zfs Cm receive
 queue.
 This value must be at least twice the maximum block size in use.
 .
 .It Sy zfs_recv_write_batch_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
 The maximum amount of data, in bytes, that
 .Nm zfs Cm receive
 will write in one DMU transaction.
 This is the uncompressed size, even when receiving a compressed send stream.
 This setting will not reduce the write size below a single block.
 Capped at a maximum of
 .Sy 32 MiB .
 .
 .It Sy zfs_recv_best_effort_corrective Ns = Ns Sy 0 Pq int
 When this variable is set to non-zero a corrective receive:
 .Bl -enum -compact -offset 4n -width "1."
 .It
 Does not enforce the restriction of source & destination snapshot GUIDs
 matching.
 .It
 If there is an error during healing, the healing receive is not
 terminated; instead, it moves on to the next record.
 .El
 .
 .It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Setting this variable overrides the default logic for estimating block
 sizes when doing a
 .Nm zfs Cm send .
 The default heuristic is that the average block size
 will be the current recordsize.
 Override this value if most data in your dataset is not of that size
 and you require accurate zfs send size estimates.
 .
 .It Sy zfs_sync_pass_deferred_free Ns = Ns Sy 2 Pq uint
 Flushing of data to disk is done in passes.
 Defer frees starting in this pass.
 .
 .It Sy zfs_spa_discard_memory_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int
 Maximum memory used for prefetching a checkpoint's space map on each
 vdev while discarding the checkpoint.
 .
 .It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint
 Only allow small data blocks to be allocated on the special and dedup vdev
 types when the available free space percentage on these vdevs exceeds this
 value.
 This ensures reserved space is available for pool metadata as the
 special vdevs approach capacity.
 .
 .It Sy zfs_sync_pass_dont_compress Ns = Ns Sy 8 Pq uint
 Starting in this sync pass, disable compression (including of metadata).
 With the default setting, in practice, we don't have this many sync passes,
 so this has no effect.
 .Pp
 The original intent was that disabling compression would help the sync passes
 to converge.
 However, in practice, disabling compression increases
 the average number of sync passes; because when we turn compression off,
 many blocks' size will change, and thus we have to re-allocate
 (not overwrite) them.
 It also increases the number of
 .Em 128 KiB
 allocations (e.g. for indirect blocks and spacemaps)
 because these will not be compressed.
 The
 .Em 128 KiB
 allocations are especially detrimental to performance
 on highly fragmented systems, which may have very few free segments of this
 size,
 and may need to load new metaslabs to satisfy these allocations.
 .
 .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint
 Rewrite new block pointers starting in this pass.
 .
 .It Sy zfs_sync_taskq_batch_pct Ns = Ns Sy 75 Ns % Pq int
 This controls the number of threads used by
 .Sy dp_sync_taskq .
 The default value of
 .Sy 75%
 will create a maximum of one thread per CPU.
 .
 .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint
 Maximum size of TRIM command.
 Larger ranges will be split into chunks no larger than this value before
 issuing.
 .
 .It Sy zfs_trim_extent_bytes_min Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
 Minimum size of TRIM commands.
 TRIM ranges smaller than this will be skipped,
 unless they're part of a larger range which was chunked.
 This is done because it's common for these small TRIMs
 to negatively impact overall performance.
 .
 .It Sy zfs_trim_metaslab_skip Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Skip uninitialized metaslabs during the TRIM process.
 This option is useful for pools constructed from large thinly-provisioned
 devices
 where TRIM operations are slow.
 As a pool ages, an increasing fraction of the pool's metaslabs
 will be initialized, progressively degrading the usefulness of this option.
 This setting is stored when starting a manual TRIM and will
 persist for the duration of the requested TRIM.
 .
 .It Sy zfs_trim_queue_limit Ns = Ns Sy 10 Pq uint
 Maximum number of queued TRIMs outstanding per leaf vdev.
 The number of concurrent TRIM commands issued to the device is controlled by
 .Sy zfs_vdev_trim_min_active No and Sy zfs_vdev_trim_max_active .
 .
 .It Sy zfs_trim_txg_batch Ns = Ns Sy 32 Pq uint
 The number of transaction groups' worth of frees which should be aggregated
 before TRIM operations are issued to the device.
 This setting represents a trade-off between issuing larger,
 more efficient TRIM operations and the delay
 before the recently trimmed space is available for use by the device.
 .Pp
 Increasing this value will allow frees to be aggregated for a longer time.
 This will result in larger TRIM operations and potentially increased memory
 usage.
 Decreasing this value will have the opposite effect.
 The default of
 .Sy 32
 was determined to be a reasonable compromise.
 .
 .It Sy zfs_txg_history Ns = Ns Sy 0 Pq uint
 Historical statistics for this many latest TXGs will be available in
 .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /TXGs .
 .
 .It Sy zfs_txg_timeout Ns = Ns Sy 5 Ns s Pq uint
 Flush dirty data to disk at least every this many seconds (maximum TXG
 duration).
 .
 .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
 Max vdev I/O aggregation size.
 .
 .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint
 Max vdev I/O aggregation size for non-rotating media.
 .
 .It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int
 A number by which the balancing algorithm increments the load calculation for
 the purpose of selecting the least busy mirror member when an I/O operation
 immediately follows its predecessor on rotational vdevs
 for the purpose of making decisions based on load.
 .
 .It Sy zfs_vdev_mirror_rotating_seek_inc Ns = Ns Sy 5 Pq int
 A number by which the balancing algorithm increments the load calculation for
 the purpose of selecting the least busy mirror member when an I/O operation
 lacks locality as defined by
 .Sy zfs_vdev_mirror_rotating_seek_offset .
 Operations within this that are not immediately following the previous operation
 are incremented by half.
 .
 .It Sy zfs_vdev_mirror_rotating_seek_offset Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int
 The maximum distance for the last queued I/O operation in which
 the balancing algorithm considers an operation to have locality.
 .No See Sx ZFS I/O SCHEDULER .
 .
 .It Sy zfs_vdev_mirror_non_rotating_inc Ns = Ns Sy 0 Pq int
 A number by which the balancing algorithm increments the load calculation for
 the purpose of selecting the least busy mirror member on non-rotational vdevs
 when I/O operations do not immediately follow one another.
 .
 .It Sy zfs_vdev_mirror_non_rotating_seek_inc Ns = Ns Sy 1 Pq int
 A number by which the balancing algorithm increments the load calculation for
 the purpose of selecting the least busy mirror member when an I/O operation
 lacks
 locality as defined by the
 .Sy zfs_vdev_mirror_rotating_seek_offset .
 Operations within this that are not immediately following the previous operation
 are incremented by half.
 .
 .It Sy zfs_vdev_read_gap_limit Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint
 Aggregate read I/O operations if the on-disk gap between them is within this
 threshold.
 .
 .It Sy zfs_vdev_write_gap_limit Ns = Ns Sy 4096 Ns B Po 4 KiB Pc Pq uint
 Aggregate write I/O operations if the on-disk gap between them is within this
 threshold.
 .
 .It Sy zfs_vdev_raidz_impl Ns = Ns Sy fastest Pq string
 Select the raidz parity implementation to use.
 .Pp
 Variants that don't depend on CPU-specific features
 may be selected on module load, as they are supported on all systems.
 The remaining options may only be set after the module is loaded,
 as they are available only if the implementations are compiled in
 and supported on the running system.
 .Pp
 Once the module is loaded,
 .Pa /sys/module/zfs/parameters/zfs_vdev_raidz_impl
 will show the available options,
 with the currently selected one enclosed in square brackets.
 .Pp
 .TS
 lb l l .
 fastest	selected by built-in benchmark
 original	original implementation
 scalar	scalar implementation
 sse2	SSE2 instruction set	64-bit x86
 ssse3	SSSE3 instruction set	64-bit x86
 avx2	AVX2 instruction set	64-bit x86
 avx512f	AVX512F instruction set	64-bit x86
 avx512bw	AVX512F & AVX512BW instruction sets	64-bit x86
 aarch64_neon	NEON	Aarch64/64-bit ARMv8
 aarch64_neonx2	NEON with more unrolling	Aarch64/64-bit ARMv8
 powerpc_altivec	Altivec	PowerPC
 .TE
 .
 .It Sy zfs_vdev_scheduler Pq charp
 .Sy DEPRECATED .
 Prints warning to kernel log for compatibility.
 .
 .It Sy zfs_zevent_len_max Ns = Ns Sy 512 Pq uint
 Max event queue length.
 Events in the queue can be viewed with
 .Xr zpool-events 8 .
 .
 .It Sy zfs_zevent_retain_max Ns = Ns Sy 2000 Pq int
 Maximum recent zevent records to retain for duplicate checking.
 Setting this to
 .Sy 0
 disables duplicate detection.
 .
 .It Sy zfs_zevent_retain_expire_secs Ns = Ns Sy 900 Ns s Po 15 min Pc Pq int
 Lifespan for a recent ereport that was retained for duplicate checking.
 .
 .It Sy zfs_zil_clean_taskq_maxalloc Ns = Ns Sy 1048576 Pq int
 The maximum number of taskq entries that are allowed to be cached.
 When this limit is exceeded transaction records (itxs)
 will be cleaned synchronously.
 .
 .It Sy zfs_zil_clean_taskq_minalloc Ns = Ns Sy 1024 Pq int
 The number of taskq entries that are pre-populated when the taskq is first
 created and are immediately available for use.
 .
 .It Sy zfs_zil_clean_taskq_nthr_pct Ns = Ns Sy 100 Ns % Pq int
 This controls the number of threads used by
 .Sy dp_zil_clean_taskq .
 The default value of
 .Sy 100%
 will create a maximum of one thread per CPU.
 .
 .It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint
 This sets the maximum block size used by the ZIL.
 On very fragmented pools, lowering this
 .Pq typically to Sy 36 KiB
 can improve performance.
 .
 .It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint
 This sets the maximum number of write bytes logged via WR_COPIED.
 It tunes a tradeoff between additional memory copy and possibly worse log
 space efficiency vs additional range lock/unlock.
 .
 .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable the cache flush commands that are normally sent to disk by
 the ZIL after an LWB write has completed.
 Setting this will cause ZIL corruption on power loss
 if a volatile out-of-order write cache is enabled.
 .
 .It Sy zil_replay_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Disable intent logging replay.
 Replay can be disabled to recover from a corrupted ZIL.
 .
 .It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64
 Limit SLOG write size per commit executed with synchronous priority.
 Any writes above that will be executed with lower (asynchronous) priority
 to limit potential SLOG device abuse by a single active ZIL writer.
 .
 .It Sy zfs_zil_saxattr Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Setting this tunable to zero disables ZIL logging of new
 .Sy xattr Ns = Ns Sy sa
 records if the
 .Sy org.openzfs:zilsaxattr
 feature is enabled on the pool.
 This would only be necessary to work around bugs in the ZIL logging or replay
 code for this record type.
 The tunable has no effect if the feature is disabled.
 .
 .It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq uint
 Usually, one metaslab from each normal-class vdev is dedicated for use by
 the ZIL to log synchronous writes.
 However, if there are fewer than
 .Sy zfs_embedded_slog_min_ms
 metaslabs in the vdev, this functionality is disabled.
 This ensures that we don't set aside an unreasonable amount of space for the
 ZIL.
 .
 .It Sy zstd_earlyabort_pass Ns = Ns Sy 1 Pq uint
 Whether the heuristic for detecting incompressible data with zstd levels >= 3,
 using LZ4 and zstd-1 passes, is enabled.
 .
 .It Sy zstd_abort_size Ns = Ns Sy 131072 Pq uint
 Minimal uncompressed size (inclusive) of a record before the early abort
 heuristic will be attempted.
 .
 .It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int
 If non-zero, the zio deadman will produce debugging messages
 .Pq see Sy zfs_dbgmsg_enable
 for all zios, rather than only for leaf zios possessing a vdev.
 This is meant to be used by developers to gain
 diagnostic information for hang conditions which don't involve a mutex
 or other locking primitive: typically conditions in which a thread in
 the zio pipeline is looping indefinitely.
 .
 .It Sy zio_slow_io_ms Ns = Ns Sy 30000 Ns ms Po 30 s Pc Pq int
 When an I/O operation takes more than this much time to complete,
 it's marked as slow.
 Each slow operation causes a delay zevent.
 Slow I/O counters can be seen with
 .Nm zpool Cm status Fl s .
 .
 .It Sy zio_dva_throttle_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
 Throttle block allocations in the I/O pipeline.
 This allows for dynamic allocation distribution when devices are imbalanced.
 When enabled, the maximum number of pending allocations per top-level vdev
 is limited by
 .Sy zfs_vdev_queue_depth_pct .
 .
 .It Sy zfs_xattr_compat Ns = Ns 0 Ns | Ns 1 Pq int
 Control the naming scheme used when setting new xattrs in the user namespace.
 If
 .Sy 0
 .Pq the default on Linux ,
 user namespace xattr names are prefixed with the namespace, to be backwards
 compatible with previous versions of ZFS on Linux.
 If
 .Sy 1
 .Pq the default on Fx ,
 user namespace xattr names are not prefixed, to be backwards compatible with
 previous versions of ZFS on illumos and
 .Fx .
 .Pp
 Either naming scheme can be read on this and future versions of ZFS, regardless
 of this tunable, but legacy ZFS on illumos or
 .Fx
 are unable to read user namespace xattrs written in the Linux format, and
 legacy versions of ZFS on Linux are unable to read user namespace xattrs written
 in the legacy ZFS format.
 .Pp
 An existing xattr with the alternate naming scheme is removed when overwriting
 the xattr so as to not accumulate duplicates.
 .
 .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int
 Prioritize requeued I/O.
 .
 .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint
 Percentage of online CPUs which will run a worker thread for I/O.
 These workers are responsible for I/O work such as compression, encryption,
 checksum and parity calculations.
 Fractional number of CPUs will be rounded down.
 .Pp
 The default value of
 .Sy 80%
 was chosen to avoid using all CPUs which can result in
 latency issues and inconsistent application performance,
 especially when slower compression and/or checksumming is enabled.
 A newly set value only applies to pools imported or created afterwards.
 .
 .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint
 Number of worker threads per taskq.
 Lower values improve I/O ordering and CPU utilization,
 while higher values reduce lock contention.
 .Pp
 If
 .Sy 0 ,
 generate a system-dependent value close to 6 threads per taskq.
 A newly set value only applies to pools imported or created afterwards.
 .
 .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp
 Set the queue and thread configuration for the IO read queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
 Newly set values only apply to pools imported or created afterwards.
 .
 .It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp
 Set the queue and thread configuration for the IO write queues.
 This is an advanced debugging parameter.
 Don't change this unless you understand what it does.
 Newly set values only apply to pools imported or created afterwards.
 .
 .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Do not create zvol device nodes.
 This may slightly improve startup time on
 systems with a very large number of zvols.
 .
 .It Sy zvol_major Ns = Ns Sy 230 Pq uint
 Major number for zvol block devices.
 .
 .It Sy zvol_max_discard_blocks Ns = Ns Sy 16384 Pq long
 Discard (TRIM) operations done on zvols will be done in batches of this
 many blocks, where block size is determined by the
 .Sy volblocksize
 property of a zvol.
 .
 .It Sy zvol_prefetch_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint
 When adding a zvol to the system, prefetch this many bytes
 from the start and end of the volume.
 Prefetching these regions of the volume is desirable,
 because they are likely to be accessed immediately by
 .Xr blkid 8
 or the kernel partitioner.
 .
 .It Sy zvol_request_sync Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 When processing I/O requests for a zvol, submit them synchronously.
 This effectively limits the queue depth to
 .Em 1
 for each I/O submitter.
 When unset, requests are handled asynchronously by a thread pool.
 The number of requests which can be handled concurrently is controlled by
 .Sy zvol_threads .
 .Sy zvol_request_sync
 is ignored when running on a kernel that supports block multiqueue
 .Pq Li blk-mq .
 .
 .It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint
 Number of zvol taskqs.
 If
 .Sy 0
 (the default) then scaling is done internally to prefer 6 threads per taskq.
 This only applies on Linux.
 .
 .It Sy zvol_threads Ns = Ns Sy 0 Pq uint
 The number of system wide threads to use for processing zvol block IOs.
 If
 .Sy 0
 (the default) then internally set
 .Sy zvol_threads
 to the number of CPUs present or 32 (whichever is greater).
 .
 .It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint
 The number of threads per zvol to use for queuing IO requests.
 This parameter will only appear if your kernel supports
 .Li blk-mq
 and is only read and assigned to a zvol at zvol load time.
 If
 .Sy 0
 (the default) then internally set
 .Sy zvol_blk_mq_threads
 to the number of CPUs present.
 .
 .It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Set to
 .Sy 1
 to use the
 .Li blk-mq
 API for zvols.
 Set to
 .Sy 0
 (the default) to use the legacy zvol APIs.
 This setting can give better or worse zvol performance depending on
 the workload.
 This parameter will only appear if your kernel supports
 .Li blk-mq
 and is only read and assigned to a zvol at zvol load time.
 .
 .It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint
 If
 .Sy zvol_use_blk_mq
 is enabled, then process this number of
 .Sy volblocksize Ns -sized blocks per zvol thread.
 This tunable can be used to favor better performance for zvol reads (lower
 values) or writes (higher values).
 If set to
 .Sy 0 ,
 then the zvol layer will process the maximum number of blocks
 per thread that it can.
 This parameter will only appear if your kernel supports
 .Li blk-mq
 and is only applied at each zvol's load time.
 .
 .It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint
 The queue_depth value for the zvol
 .Li blk-mq
 interface.
 This parameter will only appear if your kernel supports
 .Li blk-mq
 and is only applied at each zvol's load time.
 If
 .Sy 0
 (the default) then use the kernel's default queue depth.
 Values are clamped to the kernel's
 .Dv BLKDEV_MIN_RQ
 and
 .Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ
 limits.
 .
 .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint
 Defines the behaviour of zvol block devices when
 .Sy volmode Ns = Ns Sy default :
 .Bl -tag -compact -offset 4n -width "a"
 .It Sy 1
 .No equivalent to Sy full
 .It Sy 2
 .No equivalent to Sy dev
 .It Sy 3
 .No equivalent to Sy none
 .El
 .
 .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint
 Enable strict ZVOL quota enforcement.
 The strict quota enforcement may have a performance impact.
 .El
 .
 .Sh ZFS I/O SCHEDULER
 ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations.
 The scheduler determines when and in what order those operations are issued.
 The scheduler divides operations into five I/O classes,
 prioritized in the following order: sync read, sync write, async read,
 async write, and scrub/resilver.
 Each queue defines the minimum and maximum number of concurrent operations
 that may be issued to the device.
 In addition, the device has an aggregate maximum,
 .Sy zfs_vdev_max_active .
 Note that the sum of the per-queue minima must not exceed the aggregate maximum.
 If the sum of the per-queue maxima exceeds the aggregate maximum,
 then the number of active operations may reach
 .Sy zfs_vdev_max_active ,
 in which case no further operations will be issued,
 regardless of whether all per-queue minima have been met.
 .Pp
 For many physical devices, throughput increases with the number of
 concurrent operations, but latency typically suffers.
 Furthermore, physical devices typically have a limit
 at which more concurrent operations have no
 effect on throughput or can actually cause it to decrease.
 .Pp
 The scheduler selects the next operation to issue by first looking for an
 I/O class whose minimum has not been satisfied.
 Once all are satisfied and the aggregate maximum has not been hit,
 the scheduler looks for classes whose maximum has not been satisfied.
 Iteration through the I/O classes is done in the order specified above.
 No further operations are issued
 if the aggregate maximum number of concurrent operations has been hit,
 or if there are no operations queued for an I/O class that has not hit its
 maximum.
 Every time an I/O operation is queued or an operation completes,
 the scheduler looks for new operations to issue.
 .Pp
 In general, smaller
 .Sy max_active Ns s
 will lead to lower latency of synchronous operations.
 Larger
 .Sy max_active Ns s
 may lead to higher overall throughput, depending on underlying storage.
 .Pp
 The ratio of the queues'
 .Sy max_active Ns s
 determines the balance of performance between reads, writes, and scrubs.
 For example, increasing
 .Sy zfs_vdev_scrub_max_active
 will cause the scrub or resilver to complete more quickly,
 but reads and writes to have higher latency and lower throughput.
 .Pp
 All I/O classes have a fixed maximum number of outstanding operations,
 except for the async write class.
 Asynchronous writes represent the data that is committed to stable storage
 during the syncing stage for transaction groups.
 Transaction groups enter the syncing state periodically,
 so the number of queued async writes will quickly burst up
 and then bleed down to zero.
 Rather than servicing them as quickly as possible,
 the I/O scheduler changes the maximum number of active async write operations
 according to the amount of dirty data in the pool.
 Since both throughput and latency typically increase with the number of
 concurrent operations issued to physical devices, reducing the
 burstiness in the number of simultaneous operations also stabilizes the
 response time of operations from other queues, in particular synchronous ones.
 In broad strokes, the I/O scheduler will issue more concurrent operations
 from the async write queue as there is more dirty data in the pool.
 .
 .Ss Async Writes
 The number of concurrent operations issued for the async write I/O class
 follows a piece-wise linear function defined by a few adjustable points:
 .Bd -literal
        |              o---------| <-- \fBzfs_vdev_async_write_max_active\fP
   ^    |             /^         |
   |    |            / |         |
 active |           /  |         |
  I/O   |          /   |         |
 count  |         /    |         |
        |        /     |         |
        |-------o      |         | <-- \fBzfs_vdev_async_write_min_active\fP
       0|_______^______|_________|
        0%      |      |       100% of \fBzfs_dirty_data_max\fP
                |      |
                |      `-- \fBzfs_vdev_async_write_active_max_dirty_percent\fP
                `--------- \fBzfs_vdev_async_write_active_min_dirty_percent\fP
 .Ed
 .Pp
 Until the amount of dirty data exceeds a minimum percentage of the dirty
 data allowed in the pool, the I/O scheduler will limit the number of
 concurrent operations to the minimum.
 As that threshold is crossed, the number of concurrent operations issued
 increases linearly to the maximum at the specified maximum percentage
 of the dirty data allowed in the pool.
 .Pp
 Ideally, the amount of dirty data on a busy pool will stay in the sloped
 part of the function between
 .Sy zfs_vdev_async_write_active_min_dirty_percent
 and
 .Sy zfs_vdev_async_write_active_max_dirty_percent .
 If it exceeds the maximum percentage,
 this indicates that the rate of incoming data is
 greater than the rate that the backend storage can handle.
 In this case, we must further throttle incoming writes,
 as described in the next section.
 .
 .Sh ZFS TRANSACTION DELAY
 We delay transactions when we've determined that the backend storage
 isn't able to accommodate the rate of incoming writes.
 .Pp
 If there is already a transaction waiting, we delay relative to when
 that transaction will finish waiting.
 This way the calculated delay time
 is independent of the number of threads concurrently executing transactions.
 .Pp
 If we are the only waiter, wait relative to when the transaction started,
 rather than the current time.
 This credits the transaction for "time already served",
 e.g. reading indirect blocks.
 .Pp
 The minimum time for a transaction to take is calculated as
 .D1 min_time = min( Ns Sy zfs_delay_scale No \(mu Po Sy dirty No \- Sy min Pc / Po Sy max No \- Sy dirty Pc , 100ms)
 .Pp
 The delay has two degrees of freedom that can be adjusted via tunables.
 The percentage of dirty data at which we start to delay is defined by
 .Sy zfs_delay_min_dirty_percent .
 This should typically be at or above
 .Sy zfs_vdev_async_write_active_max_dirty_percent ,
 so that we only start to delay after writing at full speed
 has failed to keep up with the incoming write rate.
 The scale of the curve is defined by
 .Sy zfs_delay_scale .
 Roughly speaking, this variable determines the amount of delay at the midpoint
 of the curve.
 .Bd -literal
 delay
  10ms +-------------------------------------------------------------*+
       |                                                             *|
   9ms +                                                             *+
       |                                                             *|
   8ms +                                                             *+
       |                                                            * |
   7ms +                                                            * +
       |                                                            * |
   6ms +                                                            * +
       |                                                            * |
   5ms +                                                           *  +
       |                                                           *  |
   4ms +                                                           *  +
       |                                                           *  |
   3ms +                                                          *   +
       |                                                          *   |
   2ms +                                              (midpoint) *    +
       |                                                  |    **     |
   1ms +                                                  v ***       +
       |             \fBzfs_delay_scale\fP ---------->     ********         |
     0 +-------------------------------------*********----------------+
       0%                    <- \fBzfs_dirty_data_max\fP ->               100%
 .Ed
 .Pp
 Note that, since the delay is added to the outstanding time remaining on the
 most recent transaction, it's effectively the inverse of IOPS.
 Here, the midpoint of
 .Em 500 us
 translates to
 .Em 2000 IOPS .
 The shape of the curve
 was chosen such that small changes in the amount of accumulated dirty data
 in the first three quarters of the curve yield relatively small differences
 in the amount of delay.
 .Pp
 The effects can be easier to understand when the amount of delay is
 represented on a logarithmic scale:
 .Bd -literal
 delay
 100ms +-------------------------------------------------------------++
       +                                                              +
       |                                                              |
       +                                                             *+
  10ms +                                                             *+
       +                                                           ** +
       |                                              (midpoint)  **  |
       +                                                  |     **    +
   1ms +                                                  v ****      +
       +             \fBzfs_delay_scale\fP ---------->        *****         +
       |                                             ****             |
       +                                          ****                +
 100us +                                        **                    +
       +                                       *                      +
       |                                      *                       |
       +                                     *                        +
  10us +                                     *                        +
       +                                                              +
       |                                                              |
       +                                                              +
       +--------------------------------------------------------------+
       0%                    <- \fBzfs_dirty_data_max\fP ->               100%
 .Ed
 .Pp
 Note here that only as the amount of dirty data approaches its limit does
 the delay start to increase rapidly.
 The goal of a properly tuned system should be to keep the amount of dirty data
 out of that range by first ensuring that the appropriate limits are set
 for the I/O scheduler to reach optimal throughput on the back-end storage,
 and then by changing the value of
 .Sy zfs_delay_scale
 to increase the steepness of the curve.
diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in
index 87bd17cf5a6d..3d980819291c 100644
--- a/sys/contrib/openzfs/module/Kbuild.in
+++ b/sys/contrib/openzfs/module/Kbuild.in
@@ -1,505 +1,505 @@
 # When integrated in to a monolithic kernel the spl module must appear
 # first.  This ensures its module initialization function is run before
 # any of the other module initialization functions which depend on it.
 
 ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement
 ZFS_MODULE_CFLAGS += -Wmissing-prototypes
 ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@  @NO_FORMAT_ZERO_LENGTH@
 
 ifneq ($(KBUILD_EXTMOD),)
 zfs_include = @abs_top_srcdir@/include
 icp_include = @abs_srcdir@/icp/include
 zstd_include = @abs_srcdir@/zstd/include
 ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h
 ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include
 src = @abs_srcdir@
 obj = @abs_builddir@
 else
 zfs_include = $(srctree)/include/zfs
-icp_include = $(srctree)/$(src)/icp/include
-zstd_include = $(srctree)/$(src)/zstd/include
+icp_include = $(src)/icp/include
+zstd_include = $(src)/zstd/include
 ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h
 endif
 
 ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel
 ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl
 ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs
 ZFS_MODULE_CFLAGS += -I$(zfs_include)
 ZFS_MODULE_CPPFLAGS += -D_KERNEL
 ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@
 
 # KASAN enables -Werror=frame-larger-than=1024, which
 # breaks oh so many parts of our build.
 ifeq ($(CONFIG_KASAN),y)
 ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than=
 endif
 
 # Generated binary search code is particularly bad with this optimization.
 # Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c
 # is not affected when unrolling is done.
 # Disable it until the following upstream issue is resolved:
 # https://github.com/llvm/llvm-project/issues/62790
 ifeq ($(CONFIG_X86),y)
 ifeq ($(CONFIG_CC_IS_CLANG),y)
 CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false
 CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false
 CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false
 CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false
 endif
 endif
 
 ifneq ($(KBUILD_EXTMOD),)
 @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include
 @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@
 endif
 
 asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS)
 ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS)
 
 ifeq ($(CONFIG_ARM64),y)
 CFLAGS_REMOVE_zcommon/zfs_fletcher_aarch64_neon.o += -mgeneral-regs-only
 CFLAGS_REMOVE_zfs/vdev_raidz_math_aarch64_neon.o += -mgeneral-regs-only
 CFLAGS_REMOVE_zfs/vdev_raidz_math_aarch64_neonx2.o += -mgeneral-regs-only
 endif
 
 # Suppress unused-value warnings in sparc64 architecture headers
 ccflags-$(CONFIG_SPARC64) += -Wno-unused-value
 
 
 obj-$(CONFIG_ZFS) := spl.o zfs.o
 
 SPL_OBJS := \
 	spl-atomic.o \
 	spl-condvar.o \
 	spl-cred.o \
 	spl-err.o \
 	spl-generic.o \
 	spl-kmem-cache.o \
 	spl-kmem.o \
 	spl-kstat.o \
 	spl-proc.o \
 	spl-procfs-list.o \
 	spl-shrinker.o \
 	spl-taskq.o \
 	spl-thread.o \
 	spl-trace.o \
 	spl-tsd.o \
 	spl-vmem.o \
 	spl-xdr.o \
 	spl-zlib.o \
 	spl-zone.o
 
 spl-objs += $(addprefix os/linux/spl/,$(SPL_OBJS))
 
 zfs-objs += avl/avl.o
 
 ICP_OBJS := \
 	algs/aes/aes_impl.o \
 	algs/aes/aes_impl_generic.o \
 	algs/aes/aes_modes.o \
 	algs/blake3/blake3.o \
 	algs/blake3/blake3_generic.o \
 	algs/blake3/blake3_impl.o \
 	algs/edonr/edonr.o \
 	algs/modes/cbc.o \
 	algs/modes/ccm.o \
 	algs/modes/ctr.o \
 	algs/modes/ecb.o \
 	algs/modes/gcm.o \
 	algs/modes/gcm_generic.o \
 	algs/modes/modes.o \
 	algs/sha2/sha2_generic.o \
 	algs/sha2/sha256_impl.o \
 	algs/sha2/sha512_impl.o \
 	algs/skein/skein.o \
 	algs/skein/skein_block.o \
 	algs/skein/skein_iv.o \
 	api/kcf_cipher.o \
 	api/kcf_ctxops.o \
 	api/kcf_mac.o \
 	core/kcf_callprov.o \
 	core/kcf_mech_tabs.o \
 	core/kcf_prov_lib.o \
 	core/kcf_prov_tabs.o \
 	core/kcf_sched.o \
 	illumos-crypto.o \
 	io/aes.o \
 	io/sha2_mod.o \
 	io/skein_mod.o \
 	spi/kcf_spi.o
 
 ICP_OBJS_X86_64 := \
 	asm-x86_64/aes/aes_aesni.o \
 	asm-x86_64/aes/aes_amd64.o \
 	asm-x86_64/aes/aeskey.o \
 	asm-x86_64/blake3/blake3_avx2.o \
 	asm-x86_64/blake3/blake3_avx512.o \
 	asm-x86_64/blake3/blake3_sse2.o \
 	asm-x86_64/blake3/blake3_sse41.o \
 	asm-x86_64/sha2/sha256-x86_64.o \
 	asm-x86_64/sha2/sha512-x86_64.o \
 	asm-x86_64/modes/aesni-gcm-x86_64.o \
 	asm-x86_64/modes/gcm_pclmulqdq.o \
 	asm-x86_64/modes/ghash-x86_64.o
 
 ICP_OBJS_X86 := \
 	algs/aes/aes_impl_aesni.o \
 	algs/aes/aes_impl_x86-64.o \
 	algs/modes/gcm_pclmulqdq.o
 
 ICP_OBJS_ARM := \
 	asm-arm/sha2/sha256-armv7.o \
 	asm-arm/sha2/sha512-armv7.o
 
 ICP_OBJS_ARM64 := \
 	asm-aarch64/blake3/b3_aarch64_sse2.o \
 	asm-aarch64/blake3/b3_aarch64_sse41.o \
 	asm-aarch64/sha2/sha256-armv8.o \
 	asm-aarch64/sha2/sha512-armv8.o
 
 ICP_OBJS_PPC_PPC64 := \
 	asm-ppc64/blake3/b3_ppc64le_sse2.o \
 	asm-ppc64/blake3/b3_ppc64le_sse41.o \
 	asm-ppc64/sha2/sha256-p8.o \
 	asm-ppc64/sha2/sha512-p8.o \
 	asm-ppc64/sha2/sha256-ppc.o \
 	asm-ppc64/sha2/sha512-ppc.o
 
 zfs-objs             += $(addprefix icp/,$(ICP_OBJS))
 zfs-$(CONFIG_X86)    += $(addprefix icp/,$(ICP_OBJS_X86))
 zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86))
 zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64))
 zfs-$(CONFIG_ARM)    += $(addprefix icp/,$(ICP_OBJS_ARM))
 zfs-$(CONFIG_ARM64)  += $(addprefix icp/,$(ICP_OBJS_ARM64))
 zfs-$(CONFIG_PPC)    += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
 zfs-$(CONFIG_PPC64)  += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
 
 # Target-specific flags for the ICP objects: add the ICP, SPL and top-level
 # ZFS include directories to both the assembler flags (asflags-y) and the
 # C compiler flags (ccflags-y).
 # NOTE(review): ICP_OBJS_ARM is not part of these target lists - confirm
 # that this is intentional.
 $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \
 	$(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include) -I$(zfs_include)/os/linux/spl -I$(zfs_include)
 
 $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \
 	$(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) -I$(zfs_include)/os/linux/spl -I$(zfs_include)
 
 # Suppress objtool "return with modified stack frame" warnings.
 OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y
 
 # Suppress objtool "unsupported stack pointer realignment" warnings.
 # See #6950 for the reasoning.
 OBJECT_FILES_NON_STANDARD_sha256-x86_64.o := y
 OBJECT_FILES_NON_STANDARD_sha512-x86_64.o := y
 
 LUA_OBJS := \
 	lapi.o \
 	lauxlib.o \
 	lbaselib.o \
 	lcode.o \
 	lcompat.o \
 	lcorolib.o \
 	lctype.o \
 	ldebug.o \
 	ldo.o \
 	lfunc.o \
 	lgc.o \
 	llex.o \
 	lmem.o \
 	lobject.o \
 	lopcodes.o \
 	lparser.o \
 	lstate.o \
 	lstring.o \
 	lstrlib.o \
 	ltable.o \
 	ltablib.o \
 	ltm.o \
 	lvm.o \
 	lzio.o \
 	setjmp/setjmp.o
 
 zfs-objs += $(addprefix lua/,$(LUA_OBJS))
 
 
 NVPAIR_OBJS := \
 	fnvpair.o \
 	nvpair.o \
 	nvpair_alloc_fixed.o \
 	nvpair_alloc_spl.o
 
 zfs-objs += $(addprefix nvpair/,$(NVPAIR_OBJS))
 
 
 UNICODE_OBJS := \
 	u8_textprep.o \
 	uconv.o
 
 zfs-objs += $(addprefix unicode/,$(UNICODE_OBJS))
 
 
 ZCOMMON_OBJS := \
 	cityhash.o \
 	zfeature_common.o \
 	zfs_comutil.o \
 	zfs_deleg.o \
 	zfs_fletcher.o \
 	zfs_fletcher_superscalar.o \
 	zfs_fletcher_superscalar4.o \
 	zfs_namecheck.o \
 	zfs_prop.o \
 	zpool_prop.o \
 	zprop_common.o
 
 ZCOMMON_OBJS_X86 := \
 	zfs_fletcher_avx512.o \
 	zfs_fletcher_intel.o \
 	zfs_fletcher_sse.o
 
 ZCOMMON_OBJS_ARM64 := \
 	zfs_fletcher_aarch64_neon.o
 
 zfs-objs            += $(addprefix zcommon/,$(ZCOMMON_OBJS))
 zfs-$(CONFIG_X86)   += $(addprefix zcommon/,$(ZCOMMON_OBJS_X86))
 zfs-$(CONFIG_UML_X86)+= $(addprefix zcommon/,$(ZCOMMON_OBJS_X86))
 zfs-$(CONFIG_ARM64) += $(addprefix zcommon/,$(ZCOMMON_OBJS_ARM64))
 
 
 # Zstd uses -O3 by default, so we should follow
 ZFS_ZSTD_FLAGS := -O3
 
 # -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h
 # Set it for other compilers, too.
 ZFS_ZSTD_FLAGS += -fno-tree-vectorize
 
 # SSE register return with SSE disabled if -march=znverX is passed
 ZFS_ZSTD_FLAGS += -U__BMI__
 
 # Quiet warnings about frame size due to unused code in unmodified zstd lib
 ZFS_ZSTD_FLAGS += -Wframe-larger-than=20480
 
 ZSTD_OBJS := \
 	zfs_zstd.o \
 	zstd_sparc.o
 
 ZSTD_UPSTREAM_OBJS := \
 	lib/common/entropy_common.o \
 	lib/common/error_private.o \
 	lib/common/fse_decompress.o \
 	lib/common/pool.o \
 	lib/common/zstd_common.o \
 	lib/compress/fse_compress.o \
 	lib/compress/hist.o \
 	lib/compress/huf_compress.o \
 	lib/compress/zstd_compress.o \
 	lib/compress/zstd_compress_literals.o \
 	lib/compress/zstd_compress_sequences.o \
 	lib/compress/zstd_compress_superblock.o \
 	lib/compress/zstd_double_fast.o \
 	lib/compress/zstd_fast.o \
 	lib/compress/zstd_lazy.o \
 	lib/compress/zstd_ldm.o \
 	lib/compress/zstd_opt.o \
 	lib/decompress/huf_decompress.o \
 	lib/decompress/zstd_ddict.o \
 	lib/decompress/zstd_decompress.o \
 	lib/decompress/zstd_decompress_block.o
 
 zfs-objs += $(addprefix zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS))
 
 # Target-specific flags for the zstd objects:
 #  - every zstd object gets the zstd include directory; ccflags-y also
 #    gets $(ZFS_ZSTD_FLAGS);
 #  - the upstream zstd objects additionally force-include aarch64_compat.h
 #    (disable aarch64 neon SIMD instructions for kernel mode) and
 #    zstd_compat_wrapper.h, and are built with -Wp,-w;
 #  - zfs_zstd.o force-includes zstd_compat_wrapper.h only.
 $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -I$(zstd_include) $(ZFS_ZSTD_FLAGS)
 $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : asflags-y += -I$(zstd_include)
 $(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w
 $(obj)/zstd/zfs_zstd.o : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h
 
 
 ZFS_OBJS := \
 	abd.o \
 	aggsum.o \
 	arc.o \
 	blake3_zfs.o \
 	blkptr.o \
 	bplist.o \
 	bpobj.o \
 	bptree.o \
 	bqueue.o \
 	brt.o \
 	btree.o \
 	dataset_kstats.o \
 	dbuf.o \
 	dbuf_stats.o \
 	ddt.o \
 	ddt_zap.o \
 	dmu.o \
 	dmu_diff.o \
 	dmu_object.o \
 	dmu_objset.o \
 	dmu_recv.o \
 	dmu_redact.o \
 	dmu_send.o \
 	dmu_traverse.o \
 	dmu_tx.o \
 	dmu_zfetch.o \
 	dnode.o \
 	dnode_sync.o \
 	dsl_bookmark.o \
 	dsl_crypt.o \
 	dsl_dataset.o \
 	dsl_deadlist.o \
 	dsl_deleg.o \
 	dsl_destroy.o \
 	dsl_dir.o \
 	dsl_pool.o \
 	dsl_prop.o \
 	dsl_scan.o \
 	dsl_synctask.o \
 	dsl_userhold.o \
 	edonr_zfs.o \
 	fm.o \
 	gzip.o \
 	hkdf.o \
 	lz4.o \
 	lz4_zfs.o \
 	lzjb.o \
 	metaslab.o \
 	mmp.o \
 	multilist.o \
 	objlist.o \
 	pathname.o \
 	range_tree.o \
 	refcount.o \
 	rrwlock.o \
 	sa.o \
 	sha2_zfs.o \
 	skein_zfs.o \
 	spa.o \
 	spa_checkpoint.o \
 	spa_config.o \
 	spa_errlog.o \
 	spa_history.o \
 	spa_log_spacemap.o \
 	spa_misc.o \
 	spa_stats.o \
 	space_map.o \
 	space_reftree.o \
 	txg.o \
 	uberblock.o \
 	unique.o \
 	vdev.o \
 	vdev_draid.o \
 	vdev_draid_rand.o \
 	vdev_indirect.o \
 	vdev_indirect_births.o \
 	vdev_indirect_mapping.o \
 	vdev_initialize.o \
 	vdev_label.o \
 	vdev_mirror.o \
 	vdev_missing.o \
 	vdev_queue.o \
 	vdev_raidz.o \
 	vdev_raidz_math.o \
 	vdev_raidz_math_scalar.o \
 	vdev_rebuild.o \
 	vdev_removal.o \
 	vdev_root.o \
 	vdev_trim.o \
 	zap.o \
 	zap_leaf.o \
 	zap_micro.o \
 	zcp.o \
 	zcp_get.o \
 	zcp_global.o \
 	zcp_iter.o \
 	zcp_set.o \
 	zcp_synctask.o \
 	zfeature.o \
 	zfs_byteswap.o \
 	zfs_chksum.o \
 	zfs_fm.o \
 	zfs_fuid.o \
 	zfs_impl.o \
 	zfs_ioctl.o \
 	zfs_log.o \
 	zfs_onexit.o \
 	zfs_quota.o \
 	zfs_ratelimit.o \
 	zfs_replay.o \
 	zfs_rlock.o \
 	zfs_sa.o \
 	zfs_vnops.o \
 	zil.o \
 	zio.o \
 	zio_checksum.o \
 	zio_compress.o \
 	zio_inject.o \
 	zle.o \
 	zrlock.o \
 	zthr.o \
 	zvol.o
 
 ZFS_OBJS_OS := \
 	abd_os.o \
 	arc_os.o \
 	mmp_os.o \
 	policy.o \
 	qat.o \
 	qat_compress.o \
 	qat_crypt.o \
 	spa_misc_os.o \
 	trace.o \
 	vdev_disk.o \
 	vdev_file.o \
 	zfs_acl.o \
 	zfs_ctldir.o \
 	zfs_debug.o \
 	zfs_dir.o \
 	zfs_file_os.o \
 	zfs_ioctl_os.o \
 	zfs_racct.o \
 	zfs_sysfs.o \
 	zfs_uio.o \
 	zfs_vfsops.o \
 	zfs_vnops_os.o \
 	zfs_znode.o \
 	zio_crypt.o \
 	zpl_ctldir.o \
 	zpl_export.o \
 	zpl_file.o \
 	zpl_file_range.o \
 	zpl_inode.o \
 	zpl_super.o \
 	zpl_xattr.o \
 	zvol_os.o
 
 ZFS_OBJS_X86 := \
 	vdev_raidz_math_avx2.o \
 	vdev_raidz_math_avx512bw.o \
 	vdev_raidz_math_avx512f.o \
 	vdev_raidz_math_sse2.o \
 	vdev_raidz_math_ssse3.o
 
 ZFS_OBJS_ARM64 := \
 	vdev_raidz_math_aarch64_neon.o \
 	vdev_raidz_math_aarch64_neonx2.o
 
 ZFS_OBJS_PPC_PPC64 := \
 	vdev_raidz_math_powerpc_altivec.o
 
 zfs-objs            += $(addprefix zfs/,$(ZFS_OBJS)) $(addprefix os/linux/zfs/,$(ZFS_OBJS_OS))
 zfs-$(CONFIG_X86)   += $(addprefix zfs/,$(ZFS_OBJS_X86))
 zfs-$(CONFIG_UML_X86)+= $(addprefix zfs/,$(ZFS_OBJS_X86))
 zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64))
 zfs-$(CONFIG_PPC)   += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64))
 zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64))
 
 # Opt these objects out of UBSAN instrumentation.  zap_micro.o and sa.o are
 # listed both bare and with the zfs/ directory prefix.
 # NOTE(review): zap_leaf.o only has the bare form - confirm whether a
 # zfs/-prefixed entry is needed as well.
 UBSAN_SANITIZE_zap_leaf.o := n
 UBSAN_SANITIZE_zap_micro.o := n
 UBSAN_SANITIZE_sa.o := n
 UBSAN_SANITIZE_zfs/zap_micro.o := n
 UBSAN_SANITIZE_zfs/sa.o := n
 
 # Suppress incorrect warnings from versions of objtool which are not
 # aware of x86 EVEX prefix instructions used for AVX512.
 OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y
 OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y
 
 ifeq ($(CONFIG_ALTIVEC),y)
 $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec
 endif
diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c
index 01ce5cbd814c..0f24319511d7 100644
--- a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c
+++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c
@@ -1,309 +1,313 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
  */
 
 #include <sys/simd.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_impl.h>
 #include <sys/sha2.h>
 
 #include <sha2/sha2_impl.h>
 #include <sys/asm_linkage.h>
 
 #define	TF(E, N) \
 	extern void ASMABI E(uint32_t s[8], const void *, size_t); \
 	static inline void N(uint32_t s[8], const void *d, size_t b) { \
 	kfpu_begin(); E(s, d, b); kfpu_end(); \
 }
 
 /* some implementation is always okay */
 static inline boolean_t sha2_is_supported(void)
 {
 	return (B_TRUE);
 }
 
 #if defined(__x86_64)
 
 /* Users of ASMABI requires all calls to be from wrappers */
 extern void ASMABI
 zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t);
 
 static inline void
 tf_sha256_transform_x64(uint32_t s[8], const void *d, size_t b)
 {
 	zfs_sha256_transform_x64(s, d, b);
 }
 
 const sha256_ops_t sha256_x64_impl = {
 	.is_supported = sha2_is_supported,
 	.transform = tf_sha256_transform_x64,
 	.name = "x64"
 };
 
 #if defined(HAVE_SSSE3)
 static boolean_t sha2_have_ssse3(void)
 {
 	return (kfpu_allowed() && zfs_ssse3_available());
 }
 
 TF(zfs_sha256_transform_ssse3, tf_sha256_ssse3);
 const sha256_ops_t sha256_ssse3_impl = {
 	.is_supported = sha2_have_ssse3,
 	.transform = tf_sha256_ssse3,
 	.name = "ssse3"
 };
 #endif
 
 #if defined(HAVE_AVX)
 static boolean_t sha2_have_avx(void)
 {
 	return (kfpu_allowed() && zfs_avx_available());
 }
 
 TF(zfs_sha256_transform_avx, tf_sha256_avx);
 const sha256_ops_t sha256_avx_impl = {
 	.is_supported = sha2_have_avx,
 	.transform = tf_sha256_avx,
 	.name = "avx"
 };
 #endif
 
 #if defined(HAVE_AVX2)
 static boolean_t sha2_have_avx2(void)
 {
 	return (kfpu_allowed() && zfs_avx2_available());
 }
 
 TF(zfs_sha256_transform_avx2, tf_sha256_avx2);
 const sha256_ops_t sha256_avx2_impl = {
 	.is_supported = sha2_have_avx2,
 	.transform = tf_sha256_avx2,
 	.name = "avx2"
 };
 #endif
 
 #if defined(HAVE_SSE4_1)
 static boolean_t sha2_have_shani(void)
 {
 	return (kfpu_allowed() && zfs_sse4_1_available() && \
 	    zfs_shani_available());
 }
 
 TF(zfs_sha256_transform_shani, tf_sha256_shani);
 const sha256_ops_t sha256_shani_impl = {
 	.is_supported = sha2_have_shani,
 	.transform = tf_sha256_shani,
 	.name = "shani"
 };
 #endif
 
-#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH > 6)
+#elif defined(__aarch64__) || defined(__arm__)
+extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t);
+const sha256_ops_t sha256_armv7_impl = {
+	.is_supported = sha2_is_supported,
+	.transform = zfs_sha256_block_armv7,
+	.name = "armv7"
+};
+
+#if __ARM_ARCH > 6
 static boolean_t sha256_have_neon(void)
 {
 	return (kfpu_allowed() && zfs_neon_available());
 }
 
 static boolean_t sha256_have_armv8ce(void)
 {
 	return (kfpu_allowed() && zfs_sha256_available());
 }
 
-extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t);
-const sha256_ops_t sha256_armv7_impl = {
-	.is_supported = sha2_is_supported,
-	.transform = zfs_sha256_block_armv7,
-	.name = "armv7"
-};
-
 TF(zfs_sha256_block_neon, tf_sha256_neon);
 const sha256_ops_t sha256_neon_impl = {
 	.is_supported = sha256_have_neon,
 	.transform = tf_sha256_neon,
 	.name = "neon"
 };
 
 TF(zfs_sha256_block_armv8, tf_sha256_armv8ce);
 const sha256_ops_t sha256_armv8_impl = {
 	.is_supported = sha256_have_armv8ce,
 	.transform = tf_sha256_armv8ce,
 	.name = "armv8-ce"
 };
+#endif /* __ARM_ARCH > 6 */
 
 #elif defined(__PPC64__)
 static boolean_t sha256_have_isa207(void)
 {
 	return (kfpu_allowed() && zfs_isa207_available());
 }
 
 TF(zfs_sha256_ppc, tf_sha256_ppc);
 const sha256_ops_t sha256_ppc_impl = {
 	.is_supported = sha2_is_supported,
 	.transform = tf_sha256_ppc,
 	.name = "ppc"
 };
 
 TF(zfs_sha256_power8, tf_sha256_power8);
 const sha256_ops_t sha256_power8_impl = {
 	.is_supported = sha256_have_isa207,
 	.transform = tf_sha256_power8,
 	.name = "power8"
 };
 #endif /* __PPC64__ */
 
 /* the two generic ones */
 extern const sha256_ops_t sha256_generic_impl;
 
 /* array with all sha256 implementations */
 static const sha256_ops_t *const sha256_impls[] = {
 	&sha256_generic_impl,
 #if defined(__x86_64)
 	&sha256_x64_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_SSSE3)
 	&sha256_ssse3_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX)
 	&sha256_avx_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX2)
 	&sha256_avx2_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_SSE4_1)
 	&sha256_shani_impl,
 #endif
-#if defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH > 6)
+#if defined(__aarch64__) || defined(__arm__)
 	&sha256_armv7_impl,
+#if __ARM_ARCH > 6
 	&sha256_neon_impl,
 	&sha256_armv8_impl,
 #endif
+#endif /* __aarch64__ || __arm__ */
 #if defined(__PPC64__)
 	&sha256_ppc_impl,
 	&sha256_power8_impl,
 #endif /* __PPC64__ */
 };
 
 /* use the generic implementation functions */
 #define	IMPL_NAME		"sha256"
 #define	IMPL_OPS_T		sha256_ops_t
 #define	IMPL_ARRAY		sha256_impls
 #define	IMPL_GET_OPS		sha256_get_ops
 #define	ZFS_IMPL_OPS		zfs_sha256_ops
 #include <generic_impl.c>
 
 #ifdef _KERNEL
 
 #define	IMPL_FMT(impl, i)	(((impl) == (i)) ? "[%s] " : "%s ")
 
 #if defined(__linux__)
 
 static int
 sha256_param_get(char *buffer, zfs_kernel_param_t *unused)
 {
 	const uint32_t impl = IMPL_READ(generic_impl_chosen);
 	char *fmt;
 	int cnt = 0;
 
 	/* cycling */
 	fmt = IMPL_FMT(impl, IMPL_CYCLE);
 	cnt += sprintf(buffer + cnt, fmt, "cycle");
 
 	/* list fastest */
 	fmt = IMPL_FMT(impl, IMPL_FASTEST);
 	cnt += sprintf(buffer + cnt, fmt, "fastest");
 
 	/* list all supported implementations */
 	generic_impl_init();
 	for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
 		fmt = IMPL_FMT(impl, i);
 		cnt += sprintf(buffer + cnt, fmt,
 		    generic_supp_impls[i]->name);
 	}
 
 	return (cnt);
 }
 
 static int
 sha256_param_set(const char *val, zfs_kernel_param_t *unused)
 {
 	(void) unused;
 	return (generic_impl_setname(val));
 }
 
 #elif defined(__FreeBSD__)
 
 #include <sys/sbuf.h>
 
 static int
 sha256_param(ZFS_MODULE_PARAM_ARGS)
 {
 	int err;
 
 	generic_impl_init();
 	if (req->newptr == NULL) {
 		const uint32_t impl = IMPL_READ(generic_impl_chosen);
 		const int init_buflen = 64;
 		const char *fmt;
 		struct sbuf *s;
 
 		s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
 
 		/* cycling */
 		fmt = IMPL_FMT(impl, IMPL_CYCLE);
 		(void) sbuf_printf(s, fmt, "cycle");
 
 		/* list fastest */
 		fmt = IMPL_FMT(impl, IMPL_FASTEST);
 		(void) sbuf_printf(s, fmt, "fastest");
 
 		/* list all supported implementations */
 		for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
 			fmt = IMPL_FMT(impl, i);
 			(void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
 		}
 
 		err = sbuf_finish(s);
 		sbuf_delete(s);
 
 		return (err);
 	}
 
 	char buf[16];
 
 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 	if (err) {
 		return (err);
 	}
 
 	return (-generic_impl_setname(buf));
 }
 #endif
 
 #undef IMPL_FMT
 
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha256_impl,
     sha256_param_set, sha256_param_get, ZMOD_RW, \
 	"Select SHA256 implementation.");
 #endif
 
 #undef TF
diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c
index 27b35a639a54..6291fbd77e36 100644
--- a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c
+++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c
@@ -1,285 +1,282 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
  */
 
 #include <sys/simd.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_impl.h>
 #include <sys/sha2.h>
 
 #include <sha2/sha2_impl.h>
 #include <sys/asm_linkage.h>
 
 #define	TF(E, N) \
 	extern void ASMABI E(uint64_t s[8], const void *, size_t); \
 	static inline void N(uint64_t s[8], const void *d, size_t b) { \
 	kfpu_begin(); E(s, d, b); kfpu_end(); \
 }
 
 /* some implementation is always okay */
 static inline boolean_t sha2_is_supported(void)
 {
 	return (B_TRUE);
 }
 
 #if defined(__x86_64)
 
 /* Users of ASMABI requires all calls to be from wrappers */
 extern void ASMABI
 zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t);
 
 static inline void
 tf_sha512_transform_x64(uint64_t s[8], const void *d, size_t b)
 {
 	zfs_sha512_transform_x64(s, d, b);
 }
 const sha512_ops_t sha512_x64_impl = {
 	.is_supported = sha2_is_supported,
 	.transform = tf_sha512_transform_x64,
 	.name = "x64"
 };
 
 #if defined(HAVE_AVX)
 static boolean_t sha2_have_avx(void)
 {
 	return (kfpu_allowed() && zfs_avx_available());
 }
 
 TF(zfs_sha512_transform_avx, tf_sha512_avx);
 const sha512_ops_t sha512_avx_impl = {
 	.is_supported = sha2_have_avx,
 	.transform = tf_sha512_avx,
 	.name = "avx"
 };
 #endif
 
 #if defined(HAVE_AVX2)
 static boolean_t sha2_have_avx2(void)
 {
 	return (kfpu_allowed() && zfs_avx2_available());
 }
 
 TF(zfs_sha512_transform_avx2, tf_sha512_avx2);
 const sha512_ops_t sha512_avx2_impl = {
 	.is_supported = sha2_have_avx2,
 	.transform = tf_sha512_avx2,
 	.name = "avx2"
 };
 #endif
 
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(__arm__)
 extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
 const sha512_ops_t sha512_armv7_impl = {
 	.is_supported = sha2_is_supported,
 	.transform = zfs_sha512_block_armv7,
 	.name = "armv7"
 };
 
+#if defined(__aarch64__)
 static boolean_t sha512_have_armv8ce(void)
 {
 	return (kfpu_allowed() && zfs_sha512_available());
 }
 
 TF(zfs_sha512_block_armv8, tf_sha512_armv8ce);
 const sha512_ops_t sha512_armv8_impl = {
 	.is_supported = sha512_have_armv8ce,
 	.transform = tf_sha512_armv8ce,
 	.name = "armv8-ce"
 };
+#endif /* __aarch64__ */
 
-#elif defined(__arm__) && __ARM_ARCH > 6
-extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t);
-const sha512_ops_t sha512_armv7_impl = {
-	.is_supported = sha2_is_supported,
-	.transform = zfs_sha512_block_armv7,
-	.name = "armv7"
-};
-
+#if defined(__arm__) && __ARM_ARCH > 6
 static boolean_t sha512_have_neon(void)
 {
 	return (kfpu_allowed() && zfs_neon_available());
 }
 
 TF(zfs_sha512_block_neon, tf_sha512_neon);
 const sha512_ops_t sha512_neon_impl = {
 	.is_supported = sha512_have_neon,
 	.transform = tf_sha512_neon,
 	.name = "neon"
 };
+#endif /* __arm__ && __ARM_ARCH > 6 */
 
 #elif defined(__PPC64__)
 TF(zfs_sha512_ppc, tf_sha512_ppc);
 const sha512_ops_t sha512_ppc_impl = {
 	.is_supported = sha2_is_supported,
 	.transform = tf_sha512_ppc,
 	.name = "ppc"
 };
 
 static boolean_t sha512_have_isa207(void)
 {
 	return (kfpu_allowed() && zfs_isa207_available());
 }
 
 TF(zfs_sha512_power8, tf_sha512_power8);
 const sha512_ops_t sha512_power8_impl = {
 	.is_supported = sha512_have_isa207,
 	.transform = tf_sha512_power8,
 	.name = "power8"
 };
 #endif /* __PPC64__ */
 
 /* the two generic ones */
 extern const sha512_ops_t sha512_generic_impl;
 
 /* array with all sha512 implementations */
 static const sha512_ops_t *const sha512_impls[] = {
 	&sha512_generic_impl,
 #if defined(__x86_64)
 	&sha512_x64_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX)
 	&sha512_avx_impl,
 #endif
 #if defined(__x86_64) && defined(HAVE_AVX2)
 	&sha512_avx2_impl,
 #endif
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__arm__)
 	&sha512_armv7_impl,
+#if defined(__aarch64__)
 	&sha512_armv8_impl,
 #endif
 #if defined(__arm__) && __ARM_ARCH > 6
-	&sha512_armv7_impl,
 	&sha512_neon_impl,
 #endif
+#endif /* __aarch64__ || __arm__ */
 #if defined(__PPC64__)
 	&sha512_ppc_impl,
 	&sha512_power8_impl,
 #endif /* __PPC64__ */
 };
 
 /* use the generic implementation functions */
 #define	IMPL_NAME		"sha512"
 #define	IMPL_OPS_T		sha512_ops_t
 #define	IMPL_ARRAY		sha512_impls
 #define	IMPL_GET_OPS		sha512_get_ops
 #define	ZFS_IMPL_OPS		zfs_sha512_ops
 #include <generic_impl.c>
 
 #ifdef _KERNEL
 
 #define	IMPL_FMT(impl, i)	(((impl) == (i)) ? "[%s] " : "%s ")
 
 #if defined(__linux__)
 
 static int
 sha512_param_get(char *buffer, zfs_kernel_param_t *unused)
 {
 	const uint32_t impl = IMPL_READ(generic_impl_chosen);
 	char *fmt;
 	int cnt = 0;
 
 	/* cycling */
 	fmt = IMPL_FMT(impl, IMPL_CYCLE);
 	cnt += sprintf(buffer + cnt, fmt, "cycle");
 
 	/* list fastest */
 	fmt = IMPL_FMT(impl, IMPL_FASTEST);
 	cnt += sprintf(buffer + cnt, fmt, "fastest");
 
 	/* list all supported implementations */
 	generic_impl_init();
 	for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
 		fmt = IMPL_FMT(impl, i);
 		cnt += sprintf(buffer + cnt, fmt,
 		    generic_supp_impls[i]->name);
 	}
 
 	return (cnt);
 }
 
 static int
 sha512_param_set(const char *val, zfs_kernel_param_t *unused)
 {
 	(void) unused;
 	return (generic_impl_setname(val));
 }
 
 #elif defined(__FreeBSD__)
 
 #include <sys/sbuf.h>
 
 static int
 sha512_param(ZFS_MODULE_PARAM_ARGS)
 {
 	int err;
 
 	generic_impl_init();
 	if (req->newptr == NULL) {
 		const uint32_t impl = IMPL_READ(generic_impl_chosen);
 		const int init_buflen = 64;
 		const char *fmt;
 		struct sbuf *s;
 
 		s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req);
 
 		/* cycling */
 		fmt = IMPL_FMT(impl, IMPL_CYCLE);
 		(void) sbuf_printf(s, fmt, "cycle");
 
 		/* list fastest */
 		fmt = IMPL_FMT(impl, IMPL_FASTEST);
 		(void) sbuf_printf(s, fmt, "fastest");
 
 		/* list all supported implementations */
 		for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) {
 			fmt = IMPL_FMT(impl, i);
 			(void) sbuf_printf(s, fmt, generic_supp_impls[i]->name);
 		}
 
 		err = sbuf_finish(s);
 		sbuf_delete(s);
 
 		return (err);
 	}
 
 	/* we got module parameter */
 	char buf[16];
 
 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
 	if (err) {
 		return (err);
 	}
 
 	return (-generic_impl_setname(buf));
 }
 #endif
 
 #undef IMPL_FMT
 
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha512_impl,
     sha512_param_set, sha512_param_get, ZMOD_RW, \
 	"Select SHA512 implementation.");
 #endif
 
 #undef TF
diff --git a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S
index 0001e4d69055..3ae66626df31 100644
--- a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S
+++ b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha256-armv7.S
@@ -1,2769 +1,2774 @@
 /*
  * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     https://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 /*
  * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
  * - modified assembly to fit into OpenZFS
  */
 
 #if defined(__arm__)
 
-#define	__ARM_ARCH__      7
-#define	__ARM_MAX_ARCH__  7
+#ifndef __ARM_ARCH
+# define __ARM_ARCH__	7
+#else /* __ARM_ARCH is defined: reuse its value */
+# define __ARM_ARCH__	__ARM_ARCH
+#endif /* !__ARM_ARCH */
 
 #if defined(__thumb2__)
 .syntax unified
 .thumb
 #else
 .code   32
 #endif
 
 .text
 
 .type	K256,%object
 .align	5
 K256:
 .word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 .word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 .word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 .word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 .word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 .word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 .word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 .word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 .word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 .word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 .word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 .word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 .word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 .word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 .word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 .word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .size	K256,.-K256
 .word	0				@ terminator
 
 .align	5
 .globl	zfs_sha256_block_armv7
 .type	zfs_sha256_block_armv7,%function
 zfs_sha256_block_armv7:
 .Lzfs_sha256_block_armv7:
 
 #if __ARM_ARCH__<7 && !defined(__thumb2__)
 	sub	r3,pc,#8		@ zfs_sha256_block_armv7
 #else
 	adr	r3,.Lzfs_sha256_block_armv7
 #endif
 
 	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
 	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
 	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11}
 	sub	r14,r3,#256+32	@ K256
 	sub	sp,sp,#16*4		@ alloca(X[16])
 .Loop:
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r5,r6		@ magic
 	eor	r12,r12,r12
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 0
 # if 0==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r8,r8,ror#5
 	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r8,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 0
 	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
 	ldrb	r12,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r12,lsl#8
 	ldrb	r12,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 0==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r8,r8,ror#5
 	orr	r2,r2,r12,lsl#24
 	eor	r0,r0,r8,ror#19	@ Sigma1(e)
 #endif
 	ldr	r12,[r14],#4			@ *K256++
 	add	r11,r11,r2			@ h+=X[i]
 	str	r2,[sp,#0*4]
 	eor	r2,r9,r10
 	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r8
 	add	r11,r11,r12			@ h+=K256[i]
 	eor	r2,r2,r10			@ Ch(e,f,g)
 	eor	r0,r4,r4,ror#11
 	add	r11,r11,r2			@ h+=Ch(e,f,g)
 #if 0==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 0<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r4,r5			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
 	eor	r12,r4,r5			@ a^b, b^c in next round
 	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r4,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r7,r7,r11			@ d+=h
 	eor	r3,r3,r5			@ Maj(a,b,c)
 	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 1
 # if 1==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r7,r7,ror#5
 	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r7,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 1
 	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
 	ldrb	r3,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r3,lsl#8
 	ldrb	r3,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 1==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r7,r7,ror#5
 	orr	r2,r2,r3,lsl#24
 	eor	r0,r0,r7,ror#19	@ Sigma1(e)
 #endif
 	ldr	r3,[r14],#4			@ *K256++
 	add	r10,r10,r2			@ h+=X[i]
 	str	r2,[sp,#1*4]
 	eor	r2,r8,r9
 	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r7
 	add	r10,r10,r3			@ h+=K256[i]
 	eor	r2,r2,r9			@ Ch(e,f,g)
 	eor	r0,r11,r11,ror#11
 	add	r10,r10,r2			@ h+=Ch(e,f,g)
 #if 1==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 1<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r11,r4			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
 	eor	r3,r11,r4			@ a^b, b^c in next round
 	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r11,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r6,r6,r10			@ d+=h
 	eor	r12,r12,r4			@ Maj(a,b,c)
 	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 2
 # if 2==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r6,r6,ror#5
 	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r6,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 2
 	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
 	ldrb	r12,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r12,lsl#8
 	ldrb	r12,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 2==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r6,r6,ror#5
 	orr	r2,r2,r12,lsl#24
 	eor	r0,r0,r6,ror#19	@ Sigma1(e)
 #endif
 	ldr	r12,[r14],#4			@ *K256++
 	add	r9,r9,r2			@ h+=X[i]
 	str	r2,[sp,#2*4]
 	eor	r2,r7,r8
 	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r6
 	add	r9,r9,r12			@ h+=K256[i]
 	eor	r2,r2,r8			@ Ch(e,f,g)
 	eor	r0,r10,r10,ror#11
 	add	r9,r9,r2			@ h+=Ch(e,f,g)
 #if 2==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 2<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r10,r11			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
 	eor	r12,r10,r11			@ a^b, b^c in next round
 	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r10,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r5,r5,r9			@ d+=h
 	eor	r3,r3,r11			@ Maj(a,b,c)
 	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 3
 # if 3==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r5,r5,ror#5
 	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r5,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 3
 	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
 	ldrb	r3,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r3,lsl#8
 	ldrb	r3,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 3==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r5,r5,ror#5
 	orr	r2,r2,r3,lsl#24
 	eor	r0,r0,r5,ror#19	@ Sigma1(e)
 #endif
 	ldr	r3,[r14],#4			@ *K256++
 	add	r8,r8,r2			@ h+=X[i]
 	str	r2,[sp,#3*4]
 	eor	r2,r6,r7
 	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r5
 	add	r8,r8,r3			@ h+=K256[i]
 	eor	r2,r2,r7			@ Ch(e,f,g)
 	eor	r0,r9,r9,ror#11
 	add	r8,r8,r2			@ h+=Ch(e,f,g)
 #if 3==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 3<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r9,r10			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
 	eor	r3,r9,r10			@ a^b, b^c in next round
 	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r9,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r4,r4,r8			@ d+=h
 	eor	r12,r12,r10			@ Maj(a,b,c)
 	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 4
 # if 4==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r4,r4,ror#5
 	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r4,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 4
 	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
 	ldrb	r12,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r12,lsl#8
 	ldrb	r12,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 4==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r4,r4,ror#5
 	orr	r2,r2,r12,lsl#24
 	eor	r0,r0,r4,ror#19	@ Sigma1(e)
 #endif
 	ldr	r12,[r14],#4			@ *K256++
 	add	r7,r7,r2			@ h+=X[i]
 	str	r2,[sp,#4*4]
 	eor	r2,r5,r6
 	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r4
 	add	r7,r7,r12			@ h+=K256[i]
 	eor	r2,r2,r6			@ Ch(e,f,g)
 	eor	r0,r8,r8,ror#11
 	add	r7,r7,r2			@ h+=Ch(e,f,g)
 #if 4==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 4<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r8,r9			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
 	eor	r12,r8,r9			@ a^b, b^c in next round
 	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r8,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r11,r11,r7			@ d+=h
 	eor	r3,r3,r9			@ Maj(a,b,c)
 	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 5
 # if 5==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r11,r11,ror#5
 	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r11,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 5
 	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
 	ldrb	r3,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r3,lsl#8
 	ldrb	r3,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 5==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r11,r11,ror#5
 	orr	r2,r2,r3,lsl#24
 	eor	r0,r0,r11,ror#19	@ Sigma1(e)
 #endif
 	ldr	r3,[r14],#4			@ *K256++
 	add	r6,r6,r2			@ h+=X[i]
 	str	r2,[sp,#5*4]
 	eor	r2,r4,r5
 	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r11
 	add	r6,r6,r3			@ h+=K256[i]
 	eor	r2,r2,r5			@ Ch(e,f,g)
 	eor	r0,r7,r7,ror#11
 	add	r6,r6,r2			@ h+=Ch(e,f,g)
 #if 5==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 5<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r7,r8			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
 	eor	r3,r7,r8			@ a^b, b^c in next round
 	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r7,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r10,r10,r6			@ d+=h
 	eor	r12,r12,r8			@ Maj(a,b,c)
 	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 6
 # if 6==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r10,r10,ror#5
 	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r10,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 6
 	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
 	ldrb	r12,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r12,lsl#8
 	ldrb	r12,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 6==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r10,r10,ror#5
 	orr	r2,r2,r12,lsl#24
 	eor	r0,r0,r10,ror#19	@ Sigma1(e)
 #endif
 	ldr	r12,[r14],#4			@ *K256++
 	add	r5,r5,r2			@ h+=X[i]
 	str	r2,[sp,#6*4]
 	eor	r2,r11,r4
 	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r10
 	add	r5,r5,r12			@ h+=K256[i]
 	eor	r2,r2,r4			@ Ch(e,f,g)
 	eor	r0,r6,r6,ror#11
 	add	r5,r5,r2			@ h+=Ch(e,f,g)
 #if 6==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 6<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r6,r7			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
 	eor	r12,r6,r7			@ a^b, b^c in next round
 	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r6,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r9,r9,r5			@ d+=h
 	eor	r3,r3,r7			@ Maj(a,b,c)
 	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 7
 # if 7==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r9,r9,ror#5
 	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r9,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 7
 	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
 	ldrb	r3,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r3,lsl#8
 	ldrb	r3,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 7==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r9,r9,ror#5
 	orr	r2,r2,r3,lsl#24
 	eor	r0,r0,r9,ror#19	@ Sigma1(e)
 #endif
 	ldr	r3,[r14],#4			@ *K256++
 	add	r4,r4,r2			@ h+=X[i]
 	str	r2,[sp,#7*4]
 	eor	r2,r10,r11
 	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r9
 	add	r4,r4,r3			@ h+=K256[i]
 	eor	r2,r2,r11			@ Ch(e,f,g)
 	eor	r0,r5,r5,ror#11
 	add	r4,r4,r2			@ h+=Ch(e,f,g)
 #if 7==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 7<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r5,r6			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
 	eor	r3,r5,r6			@ a^b, b^c in next round
 	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r5,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r8,r8,r4			@ d+=h
 	eor	r12,r12,r6			@ Maj(a,b,c)
 	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 8
 # if 8==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r8,r8,ror#5
 	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r8,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 8
 	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
 	ldrb	r12,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r12,lsl#8
 	ldrb	r12,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 8==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r8,r8,ror#5
 	orr	r2,r2,r12,lsl#24
 	eor	r0,r0,r8,ror#19	@ Sigma1(e)
 #endif
 	ldr	r12,[r14],#4			@ *K256++
 	add	r11,r11,r2			@ h+=X[i]
 	str	r2,[sp,#8*4]
 	eor	r2,r9,r10
 	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r8
 	add	r11,r11,r12			@ h+=K256[i]
 	eor	r2,r2,r10			@ Ch(e,f,g)
 	eor	r0,r4,r4,ror#11
 	add	r11,r11,r2			@ h+=Ch(e,f,g)
 #if 8==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 8<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r4,r5			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
 	eor	r12,r4,r5			@ a^b, b^c in next round
 	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r4,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r7,r7,r11			@ d+=h
 	eor	r3,r3,r5			@ Maj(a,b,c)
 	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 9
 # if 9==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r7,r7,ror#5
 	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r7,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 9
 	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
 	ldrb	r3,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r3,lsl#8
 	ldrb	r3,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 9==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r7,r7,ror#5
 	orr	r2,r2,r3,lsl#24
 	eor	r0,r0,r7,ror#19	@ Sigma1(e)
 #endif
 	ldr	r3,[r14],#4			@ *K256++
 	add	r10,r10,r2			@ h+=X[i]
 	str	r2,[sp,#9*4]
 	eor	r2,r8,r9
 	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r7
 	add	r10,r10,r3			@ h+=K256[i]
 	eor	r2,r2,r9			@ Ch(e,f,g)
 	eor	r0,r11,r11,ror#11
 	add	r10,r10,r2			@ h+=Ch(e,f,g)
 #if 9==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 9<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r11,r4			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
 	eor	r3,r11,r4			@ a^b, b^c in next round
 	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r11,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r6,r6,r10			@ d+=h
 	eor	r12,r12,r4			@ Maj(a,b,c)
 	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 10
 # if 10==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r6,r6,ror#5
 	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r6,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 10
 	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
 	ldrb	r12,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r12,lsl#8
 	ldrb	r12,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 10==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r6,r6,ror#5
 	orr	r2,r2,r12,lsl#24
 	eor	r0,r0,r6,ror#19	@ Sigma1(e)
 #endif
 	ldr	r12,[r14],#4			@ *K256++
 	add	r9,r9,r2			@ h+=X[i]
 	str	r2,[sp,#10*4]
 	eor	r2,r7,r8
 	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r6
 	add	r9,r9,r12			@ h+=K256[i]
 	eor	r2,r2,r8			@ Ch(e,f,g)
 	eor	r0,r10,r10,ror#11
 	add	r9,r9,r2			@ h+=Ch(e,f,g)
 #if 10==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 10<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r10,r11			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
 	eor	r12,r10,r11			@ a^b, b^c in next round
 	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r10,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r5,r5,r9			@ d+=h
 	eor	r3,r3,r11			@ Maj(a,b,c)
 	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 11
 # if 11==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r5,r5,ror#5
 	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r5,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 11
 	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
 	ldrb	r3,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r3,lsl#8
 	ldrb	r3,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 11==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r5,r5,ror#5
 	orr	r2,r2,r3,lsl#24
 	eor	r0,r0,r5,ror#19	@ Sigma1(e)
 #endif
 	ldr	r3,[r14],#4			@ *K256++
 	add	r8,r8,r2			@ h+=X[i]
 	str	r2,[sp,#11*4]
 	eor	r2,r6,r7
 	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r5
 	add	r8,r8,r3			@ h+=K256[i]
 	eor	r2,r2,r7			@ Ch(e,f,g)
 	eor	r0,r9,r9,ror#11
 	add	r8,r8,r2			@ h+=Ch(e,f,g)
 #if 11==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 11<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r9,r10			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
 	eor	r3,r9,r10			@ a^b, b^c in next round
 	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r9,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r4,r4,r8			@ d+=h
 	eor	r12,r12,r10			@ Maj(a,b,c)
 	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 12
 # if 12==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r4,r4,ror#5
 	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r4,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 12
 	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
 	ldrb	r12,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r12,lsl#8
 	ldrb	r12,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 12==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r4,r4,ror#5
 	orr	r2,r2,r12,lsl#24
 	eor	r0,r0,r4,ror#19	@ Sigma1(e)
 #endif
 	ldr	r12,[r14],#4			@ *K256++
 	add	r7,r7,r2			@ h+=X[i]
 	str	r2,[sp,#12*4]
 	eor	r2,r5,r6
 	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r4
 	add	r7,r7,r12			@ h+=K256[i]
 	eor	r2,r2,r6			@ Ch(e,f,g)
 	eor	r0,r8,r8,ror#11
 	add	r7,r7,r2			@ h+=Ch(e,f,g)
 #if 12==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 12<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r8,r9			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
 	eor	r12,r8,r9			@ a^b, b^c in next round
 	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r8,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r11,r11,r7			@ d+=h
 	eor	r3,r3,r9			@ Maj(a,b,c)
 	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 13
 # if 13==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r11,r11,ror#5
 	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r11,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 13
 	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
 	ldrb	r3,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r3,lsl#8
 	ldrb	r3,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 13==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r11,r11,ror#5
 	orr	r2,r2,r3,lsl#24
 	eor	r0,r0,r11,ror#19	@ Sigma1(e)
 #endif
 	ldr	r3,[r14],#4			@ *K256++
 	add	r6,r6,r2			@ h+=X[i]
 	str	r2,[sp,#13*4]
 	eor	r2,r4,r5
 	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r11
 	add	r6,r6,r3			@ h+=K256[i]
 	eor	r2,r2,r5			@ Ch(e,f,g)
 	eor	r0,r7,r7,ror#11
 	add	r6,r6,r2			@ h+=Ch(e,f,g)
 #if 13==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 13<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r7,r8			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
 	eor	r3,r7,r8			@ a^b, b^c in next round
 	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r7,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r10,r10,r6			@ d+=h
 	eor	r12,r12,r8			@ Maj(a,b,c)
 	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 14
 # if 14==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r10,r10,ror#5
 	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r10,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 14
 	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
 	ldrb	r12,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r12,lsl#8
 	ldrb	r12,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 14==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r10,r10,ror#5
 	orr	r2,r2,r12,lsl#24
 	eor	r0,r0,r10,ror#19	@ Sigma1(e)
 #endif
 	ldr	r12,[r14],#4			@ *K256++
 	add	r5,r5,r2			@ h+=X[i]
 	str	r2,[sp,#14*4]
 	eor	r2,r11,r4
 	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r10
 	add	r5,r5,r12			@ h+=K256[i]
 	eor	r2,r2,r4			@ Ch(e,f,g)
 	eor	r0,r6,r6,ror#11
 	add	r5,r5,r2			@ h+=Ch(e,f,g)
 #if 14==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 14<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r6,r7			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
 	eor	r12,r6,r7			@ a^b, b^c in next round
 	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r6,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r9,r9,r5			@ d+=h
 	eor	r3,r3,r7			@ Maj(a,b,c)
 	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
 #if __ARM_ARCH__>=7
 	@ ldr	r2,[r1],#4			@ 15
 # if 15==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r9,r9,ror#5
 	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
 	eor	r0,r0,r9,ror#19	@ Sigma1(e)
 # ifndef __ARMEB__
 	rev	r2,r2
 # endif
 #else
 	@ ldrb	r2,[r1,#3]			@ 15
 	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
 	ldrb	r3,[r1,#2]
 	ldrb	r0,[r1,#1]
 	orr	r2,r2,r3,lsl#8
 	ldrb	r3,[r1],#4
 	orr	r2,r2,r0,lsl#16
 # if 15==15
 	str	r1,[sp,#17*4]			@ make room for r1
 # endif
 	eor	r0,r9,r9,ror#5
 	orr	r2,r2,r3,lsl#24
 	eor	r0,r0,r9,ror#19	@ Sigma1(e)
 #endif
 	ldr	r3,[r14],#4			@ *K256++
 	add	r4,r4,r2			@ h+=X[i]
 	str	r2,[sp,#15*4]
 	eor	r2,r10,r11
 	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r9
 	add	r4,r4,r3			@ h+=K256[i]
 	eor	r2,r2,r11			@ Ch(e,f,g)
 	eor	r0,r5,r5,ror#11
 	add	r4,r4,r2			@ h+=Ch(e,f,g)
 #if 15==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 15<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r5,r6			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
 	eor	r3,r5,r6			@ a^b, b^c in next round
 	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r5,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r8,r8,r4			@ d+=h
 	eor	r12,r12,r6			@ Maj(a,b,c)
 	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
 .Lrounds_16_xx:
 	@ ldr	r2,[sp,#1*4]		@ 16
 	@ ldr	r1,[sp,#14*4]
 	mov	r0,r2,ror#7
 	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
 	mov	r12,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r12,r12,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#0*4]
 	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#9*4]
 
 	add	r12,r12,r0
 	eor	r0,r8,r8,ror#5	@ from BODY_00_15
 	add	r2,r2,r12
 	eor	r0,r0,r8,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r12,[r14],#4			@ *K256++
 	add	r11,r11,r2			@ h+=X[i]
 	str	r2,[sp,#0*4]
 	eor	r2,r9,r10
 	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r8
 	add	r11,r11,r12			@ h+=K256[i]
 	eor	r2,r2,r10			@ Ch(e,f,g)
 	eor	r0,r4,r4,ror#11
 	add	r11,r11,r2			@ h+=Ch(e,f,g)
 #if 16==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 16<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r4,r5			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#2*4]		@ from future BODY_16_xx
 	eor	r12,r4,r5			@ a^b, b^c in next round
 	ldr	r1,[sp,#15*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r4,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r7,r7,r11			@ d+=h
 	eor	r3,r3,r5			@ Maj(a,b,c)
 	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#2*4]		@ 17
 	@ ldr	r1,[sp,#15*4]
 	mov	r0,r2,ror#7
 	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
 	mov	r3,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r3,r3,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#1*4]
 	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#10*4]
 
 	add	r3,r3,r0
 	eor	r0,r7,r7,ror#5	@ from BODY_00_15
 	add	r2,r2,r3
 	eor	r0,r0,r7,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r3,[r14],#4			@ *K256++
 	add	r10,r10,r2			@ h+=X[i]
 	str	r2,[sp,#1*4]
 	eor	r2,r8,r9
 	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r7
 	add	r10,r10,r3			@ h+=K256[i]
 	eor	r2,r2,r9			@ Ch(e,f,g)
 	eor	r0,r11,r11,ror#11
 	add	r10,r10,r2			@ h+=Ch(e,f,g)
 #if 17==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 17<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r11,r4			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#3*4]		@ from future BODY_16_xx
 	eor	r3,r11,r4			@ a^b, b^c in next round
 	ldr	r1,[sp,#0*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r11,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r6,r6,r10			@ d+=h
 	eor	r12,r12,r4			@ Maj(a,b,c)
 	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#3*4]		@ 18
 	@ ldr	r1,[sp,#0*4]
 	mov	r0,r2,ror#7
 	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
 	mov	r12,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r12,r12,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#2*4]
 	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#11*4]
 
 	add	r12,r12,r0
 	eor	r0,r6,r6,ror#5	@ from BODY_00_15
 	add	r2,r2,r12
 	eor	r0,r0,r6,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r12,[r14],#4			@ *K256++
 	add	r9,r9,r2			@ h+=X[i]
 	str	r2,[sp,#2*4]
 	eor	r2,r7,r8
 	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r6
 	add	r9,r9,r12			@ h+=K256[i]
 	eor	r2,r2,r8			@ Ch(e,f,g)
 	eor	r0,r10,r10,ror#11
 	add	r9,r9,r2			@ h+=Ch(e,f,g)
 #if 18==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 18<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r10,r11			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#4*4]		@ from future BODY_16_xx
 	eor	r12,r10,r11			@ a^b, b^c in next round
 	ldr	r1,[sp,#1*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r10,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r5,r5,r9			@ d+=h
 	eor	r3,r3,r11			@ Maj(a,b,c)
 	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#4*4]		@ 19
 	@ ldr	r1,[sp,#1*4]
 	mov	r0,r2,ror#7
 	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
 	mov	r3,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r3,r3,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#3*4]
 	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#12*4]
 
 	add	r3,r3,r0
 	eor	r0,r5,r5,ror#5	@ from BODY_00_15
 	add	r2,r2,r3
 	eor	r0,r0,r5,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r3,[r14],#4			@ *K256++
 	add	r8,r8,r2			@ h+=X[i]
 	str	r2,[sp,#3*4]
 	eor	r2,r6,r7
 	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r5
 	add	r8,r8,r3			@ h+=K256[i]
 	eor	r2,r2,r7			@ Ch(e,f,g)
 	eor	r0,r9,r9,ror#11
 	add	r8,r8,r2			@ h+=Ch(e,f,g)
 #if 19==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 19<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r9,r10			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#5*4]		@ from future BODY_16_xx
 	eor	r3,r9,r10			@ a^b, b^c in next round
 	ldr	r1,[sp,#2*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r9,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r4,r4,r8			@ d+=h
 	eor	r12,r12,r10			@ Maj(a,b,c)
 	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#5*4]		@ 20
 	@ ldr	r1,[sp,#2*4]
 	mov	r0,r2,ror#7
 	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
 	mov	r12,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r12,r12,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#4*4]
 	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#13*4]
 
 	add	r12,r12,r0
 	eor	r0,r4,r4,ror#5	@ from BODY_00_15
 	add	r2,r2,r12
 	eor	r0,r0,r4,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r12,[r14],#4			@ *K256++
 	add	r7,r7,r2			@ h+=X[i]
 	str	r2,[sp,#4*4]
 	eor	r2,r5,r6
 	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r4
 	add	r7,r7,r12			@ h+=K256[i]
 	eor	r2,r2,r6			@ Ch(e,f,g)
 	eor	r0,r8,r8,ror#11
 	add	r7,r7,r2			@ h+=Ch(e,f,g)
 #if 20==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 20<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r8,r9			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#6*4]		@ from future BODY_16_xx
 	eor	r12,r8,r9			@ a^b, b^c in next round
 	ldr	r1,[sp,#3*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r8,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r11,r11,r7			@ d+=h
 	eor	r3,r3,r9			@ Maj(a,b,c)
 	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#6*4]		@ 21
 	@ ldr	r1,[sp,#3*4]
 	mov	r0,r2,ror#7
 	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
 	mov	r3,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r3,r3,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#5*4]
 	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#14*4]
 
 	add	r3,r3,r0
 	eor	r0,r11,r11,ror#5	@ from BODY_00_15
 	add	r2,r2,r3
 	eor	r0,r0,r11,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r3,[r14],#4			@ *K256++
 	add	r6,r6,r2			@ h+=X[i]
 	str	r2,[sp,#5*4]
 	eor	r2,r4,r5
 	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r11
 	add	r6,r6,r3			@ h+=K256[i]
 	eor	r2,r2,r5			@ Ch(e,f,g)
 	eor	r0,r7,r7,ror#11
 	add	r6,r6,r2			@ h+=Ch(e,f,g)
 #if 21==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 21<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r7,r8			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#7*4]		@ from future BODY_16_xx
 	eor	r3,r7,r8			@ a^b, b^c in next round
 	ldr	r1,[sp,#4*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r7,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r10,r10,r6			@ d+=h
 	eor	r12,r12,r8			@ Maj(a,b,c)
 	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#7*4]		@ 22
 	@ ldr	r1,[sp,#4*4]
 	mov	r0,r2,ror#7
 	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
 	mov	r12,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r12,r12,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#6*4]
 	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#15*4]
 
 	add	r12,r12,r0
 	eor	r0,r10,r10,ror#5	@ from BODY_00_15
 	add	r2,r2,r12
 	eor	r0,r0,r10,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r12,[r14],#4			@ *K256++
 	add	r5,r5,r2			@ h+=X[i]
 	str	r2,[sp,#6*4]
 	eor	r2,r11,r4
 	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r10
 	add	r5,r5,r12			@ h+=K256[i]
 	eor	r2,r2,r4			@ Ch(e,f,g)
 	eor	r0,r6,r6,ror#11
 	add	r5,r5,r2			@ h+=Ch(e,f,g)
 #if 22==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 22<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r6,r7			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#8*4]		@ from future BODY_16_xx
 	eor	r12,r6,r7			@ a^b, b^c in next round
 	ldr	r1,[sp,#5*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r6,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r9,r9,r5			@ d+=h
 	eor	r3,r3,r7			@ Maj(a,b,c)
 	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#8*4]		@ 23
 	@ ldr	r1,[sp,#5*4]
 	mov	r0,r2,ror#7
 	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
 	mov	r3,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r3,r3,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#7*4]
 	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#0*4]
 
 	add	r3,r3,r0
 	eor	r0,r9,r9,ror#5	@ from BODY_00_15
 	add	r2,r2,r3
 	eor	r0,r0,r9,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r3,[r14],#4			@ *K256++
 	add	r4,r4,r2			@ h+=X[i]
 	str	r2,[sp,#7*4]
 	eor	r2,r10,r11
 	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r9
 	add	r4,r4,r3			@ h+=K256[i]
 	eor	r2,r2,r11			@ Ch(e,f,g)
 	eor	r0,r5,r5,ror#11
 	add	r4,r4,r2			@ h+=Ch(e,f,g)
 #if 23==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 23<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r5,r6			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#9*4]		@ from future BODY_16_xx
 	eor	r3,r5,r6			@ a^b, b^c in next round
 	ldr	r1,[sp,#6*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r5,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r8,r8,r4			@ d+=h
 	eor	r12,r12,r6			@ Maj(a,b,c)
 	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#9*4]		@ 24
 	@ ldr	r1,[sp,#6*4]
 	mov	r0,r2,ror#7
 	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
 	mov	r12,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r12,r12,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#8*4]
 	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#1*4]
 
 	add	r12,r12,r0
 	eor	r0,r8,r8,ror#5	@ from BODY_00_15
 	add	r2,r2,r12
 	eor	r0,r0,r8,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r12,[r14],#4			@ *K256++
 	add	r11,r11,r2			@ h+=X[i]
 	str	r2,[sp,#8*4]
 	eor	r2,r9,r10
 	add	r11,r11,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r8
 	add	r11,r11,r12			@ h+=K256[i]
 	eor	r2,r2,r10			@ Ch(e,f,g)
 	eor	r0,r4,r4,ror#11
 	add	r11,r11,r2			@ h+=Ch(e,f,g)
 #if 24==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 24<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r4,r5			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#10*4]		@ from future BODY_16_xx
 	eor	r12,r4,r5			@ a^b, b^c in next round
 	ldr	r1,[sp,#7*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r4,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r7,r7,r11			@ d+=h
 	eor	r3,r3,r5			@ Maj(a,b,c)
 	add	r11,r11,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r11,r11,r3			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#10*4]		@ 25
 	@ ldr	r1,[sp,#7*4]
 	mov	r0,r2,ror#7
 	add	r11,r11,r3			@ h+=Maj(a,b,c) from the past
 	mov	r3,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r3,r3,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#9*4]
 	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#2*4]
 
 	add	r3,r3,r0
 	eor	r0,r7,r7,ror#5	@ from BODY_00_15
 	add	r2,r2,r3
 	eor	r0,r0,r7,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r3,[r14],#4			@ *K256++
 	add	r10,r10,r2			@ h+=X[i]
 	str	r2,[sp,#9*4]
 	eor	r2,r8,r9
 	add	r10,r10,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r7
 	add	r10,r10,r3			@ h+=K256[i]
 	eor	r2,r2,r9			@ Ch(e,f,g)
 	eor	r0,r11,r11,ror#11
 	add	r10,r10,r2			@ h+=Ch(e,f,g)
 #if 25==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 25<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r11,r4			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#11*4]		@ from future BODY_16_xx
 	eor	r3,r11,r4			@ a^b, b^c in next round
 	ldr	r1,[sp,#8*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r11,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r6,r6,r10			@ d+=h
 	eor	r12,r12,r4			@ Maj(a,b,c)
 	add	r10,r10,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r10,r10,r12			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#11*4]		@ 26
 	@ ldr	r1,[sp,#8*4]
 	mov	r0,r2,ror#7
 	add	r10,r10,r12			@ h+=Maj(a,b,c) from the past
 	mov	r12,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r12,r12,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#10*4]
 	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#3*4]
 
 	add	r12,r12,r0
 	eor	r0,r6,r6,ror#5	@ from BODY_00_15
 	add	r2,r2,r12
 	eor	r0,r0,r6,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r12,[r14],#4			@ *K256++
 	add	r9,r9,r2			@ h+=X[i]
 	str	r2,[sp,#10*4]
 	eor	r2,r7,r8
 	add	r9,r9,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r6
 	add	r9,r9,r12			@ h+=K256[i]
 	eor	r2,r2,r8			@ Ch(e,f,g)
 	eor	r0,r10,r10,ror#11
 	add	r9,r9,r2			@ h+=Ch(e,f,g)
 #if 26==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 26<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r10,r11			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#12*4]		@ from future BODY_16_xx
 	eor	r12,r10,r11			@ a^b, b^c in next round
 	ldr	r1,[sp,#9*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r10,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r5,r5,r9			@ d+=h
 	eor	r3,r3,r11			@ Maj(a,b,c)
 	add	r9,r9,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r9,r9,r3			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#12*4]		@ 27
 	@ ldr	r1,[sp,#9*4]
 	mov	r0,r2,ror#7
 	add	r9,r9,r3			@ h+=Maj(a,b,c) from the past
 	mov	r3,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r3,r3,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#11*4]
 	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#4*4]
 
 	add	r3,r3,r0
 	eor	r0,r5,r5,ror#5	@ from BODY_00_15
 	add	r2,r2,r3
 	eor	r0,r0,r5,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r3,[r14],#4			@ *K256++
 	add	r8,r8,r2			@ h+=X[i]
 	str	r2,[sp,#11*4]
 	eor	r2,r6,r7
 	add	r8,r8,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r5
 	add	r8,r8,r3			@ h+=K256[i]
 	eor	r2,r2,r7			@ Ch(e,f,g)
 	eor	r0,r9,r9,ror#11
 	add	r8,r8,r2			@ h+=Ch(e,f,g)
 #if 27==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 27<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r9,r10			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#13*4]		@ from future BODY_16_xx
 	eor	r3,r9,r10			@ a^b, b^c in next round
 	ldr	r1,[sp,#10*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r9,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r4,r4,r8			@ d+=h
 	eor	r12,r12,r10			@ Maj(a,b,c)
 	add	r8,r8,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r8,r8,r12			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#13*4]		@ 28
 	@ ldr	r1,[sp,#10*4]
 	mov	r0,r2,ror#7
 	add	r8,r8,r12			@ h+=Maj(a,b,c) from the past
 	mov	r12,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r12,r12,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#12*4]
 	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#5*4]
 
 	add	r12,r12,r0
 	eor	r0,r4,r4,ror#5	@ from BODY_00_15
 	add	r2,r2,r12
 	eor	r0,r0,r4,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r12,[r14],#4			@ *K256++
 	add	r7,r7,r2			@ h+=X[i]
 	str	r2,[sp,#12*4]
 	eor	r2,r5,r6
 	add	r7,r7,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r4
 	add	r7,r7,r12			@ h+=K256[i]
 	eor	r2,r2,r6			@ Ch(e,f,g)
 	eor	r0,r8,r8,ror#11
 	add	r7,r7,r2			@ h+=Ch(e,f,g)
 #if 28==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 28<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r8,r9			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#14*4]		@ from future BODY_16_xx
 	eor	r12,r8,r9			@ a^b, b^c in next round
 	ldr	r1,[sp,#11*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r8,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r11,r11,r7			@ d+=h
 	eor	r3,r3,r9			@ Maj(a,b,c)
 	add	r7,r7,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r7,r7,r3			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#14*4]		@ 29
 	@ ldr	r1,[sp,#11*4]
 	mov	r0,r2,ror#7
 	add	r7,r7,r3			@ h+=Maj(a,b,c) from the past
 	mov	r3,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r3,r3,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#13*4]
 	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#6*4]
 
 	add	r3,r3,r0
 	eor	r0,r11,r11,ror#5	@ from BODY_00_15
 	add	r2,r2,r3
 	eor	r0,r0,r11,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r3,[r14],#4			@ *K256++
 	add	r6,r6,r2			@ h+=X[i]
 	str	r2,[sp,#13*4]
 	eor	r2,r4,r5
 	add	r6,r6,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r11
 	add	r6,r6,r3			@ h+=K256[i]
 	eor	r2,r2,r5			@ Ch(e,f,g)
 	eor	r0,r7,r7,ror#11
 	add	r6,r6,r2			@ h+=Ch(e,f,g)
 #if 29==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 29<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r7,r8			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#15*4]		@ from future BODY_16_xx
 	eor	r3,r7,r8			@ a^b, b^c in next round
 	ldr	r1,[sp,#12*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r7,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r10,r10,r6			@ d+=h
 	eor	r12,r12,r8			@ Maj(a,b,c)
 	add	r6,r6,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r6,r6,r12			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#15*4]		@ 30
 	@ ldr	r1,[sp,#12*4]
 	mov	r0,r2,ror#7
 	add	r6,r6,r12			@ h+=Maj(a,b,c) from the past
 	mov	r12,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r12,r12,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#14*4]
 	eor	r12,r12,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#7*4]
 
 	add	r12,r12,r0
 	eor	r0,r10,r10,ror#5	@ from BODY_00_15
 	add	r2,r2,r12
 	eor	r0,r0,r10,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r12,[r14],#4			@ *K256++
 	add	r5,r5,r2			@ h+=X[i]
 	str	r2,[sp,#14*4]
 	eor	r2,r11,r4
 	add	r5,r5,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r10
 	add	r5,r5,r12			@ h+=K256[i]
 	eor	r2,r2,r4			@ Ch(e,f,g)
 	eor	r0,r6,r6,ror#11
 	add	r5,r5,r2			@ h+=Ch(e,f,g)
 #if 30==31
 	and	r12,r12,#0xff
 	cmp	r12,#0xf2			@ done?
 #endif
 #if 30<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r12,r6,r7			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#0*4]		@ from future BODY_16_xx
 	eor	r12,r6,r7			@ a^b, b^c in next round
 	ldr	r1,[sp,#13*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r6,ror#20	@ Sigma0(a)
 	and	r3,r3,r12			@ (b^c)&=(a^b)
 	add	r9,r9,r5			@ d+=h
 	eor	r3,r3,r7			@ Maj(a,b,c)
 	add	r5,r5,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r5,r5,r3			@ h+=Maj(a,b,c)
 	@ ldr	r2,[sp,#0*4]		@ 31
 	@ ldr	r1,[sp,#13*4]
 	mov	r0,r2,ror#7
 	add	r5,r5,r3			@ h+=Maj(a,b,c) from the past
 	mov	r3,r1,ror#17
 	eor	r0,r0,r2,ror#18
 	eor	r3,r3,r1,ror#19
 	eor	r0,r0,r2,lsr#3	@ sigma0(X[i+1])
 	ldr	r2,[sp,#15*4]
 	eor	r3,r3,r1,lsr#10	@ sigma1(X[i+14])
 	ldr	r1,[sp,#8*4]
 
 	add	r3,r3,r0
 	eor	r0,r9,r9,ror#5	@ from BODY_00_15
 	add	r2,r2,r3
 	eor	r0,r0,r9,ror#19	@ Sigma1(e)
 	add	r2,r2,r1			@ X[i]
 	ldr	r3,[r14],#4			@ *K256++
 	add	r4,r4,r2			@ h+=X[i]
 	str	r2,[sp,#15*4]
 	eor	r2,r10,r11
 	add	r4,r4,r0,ror#6	@ h+=Sigma1(e)
 	and	r2,r2,r9
 	add	r4,r4,r3			@ h+=K256[i]
 	eor	r2,r2,r11			@ Ch(e,f,g)
 	eor	r0,r5,r5,ror#11
 	add	r4,r4,r2			@ h+=Ch(e,f,g)
 #if 31==31
 	and	r3,r3,#0xff
 	cmp	r3,#0xf2			@ done?
 #endif
 #if 31<15
 # if __ARM_ARCH__>=7
 	ldr	r2,[r1],#4			@ prefetch
 # else
 	ldrb	r2,[r1,#3]
 # endif
 	eor	r3,r5,r6			@ a^b, b^c in next round
 #else
 	ldr	r2,[sp,#1*4]		@ from future BODY_16_xx
 	eor	r3,r5,r6			@ a^b, b^c in next round
 	ldr	r1,[sp,#14*4]	@ from future BODY_16_xx
 #endif
 	eor	r0,r0,r5,ror#20	@ Sigma0(a)
 	and	r12,r12,r3			@ (b^c)&=(a^b)
 	add	r8,r8,r4			@ d+=h
 	eor	r12,r12,r6			@ Maj(a,b,c)
 	add	r4,r4,r0,ror#2	@ h+=Sigma0(a)
 	@ add	r4,r4,r12			@ h+=Maj(a,b,c)
 #ifdef	__thumb2__
 	ite	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	ldreq	r3,[sp,#16*4]		@ pull ctx
 	bne	.Lrounds_16_xx
 
 	add	r4,r4,r12		@ h+=Maj(a,b,c) from the past
 	ldr	r0,[r3,#0]
 	ldr	r2,[r3,#4]
 	ldr	r12,[r3,#8]
 	add	r4,r4,r0
 	ldr	r0,[r3,#12]
 	add	r5,r5,r2
 	ldr	r2,[r3,#16]
 	add	r6,r6,r12
 	ldr	r12,[r3,#20]
 	add	r7,r7,r0
 	ldr	r0,[r3,#24]
 	add	r8,r8,r2
 	ldr	r2,[r3,#28]
 	add	r9,r9,r12
 	ldr	r1,[sp,#17*4]		@ pull inp
 	ldr	r12,[sp,#18*4]		@ pull inp+len
 	add	r10,r10,r0
 	add	r11,r11,r2
 	stmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}
 	cmp	r1,r12
 	sub	r14,r14,#256	@ rewind Ktbl
 	bne	.Loop
 
 	add	sp,sp,#19*4	@ destroy frame
 #if __ARM_ARCH__>=5
 	ldmia	sp!,{r4-r11,pc}
 #else
 	ldmia	sp!,{r4-r11,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 	.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
 #endif
 .size	zfs_sha256_block_armv7,.-zfs_sha256_block_armv7
 
+#if __ARM_ARCH__ >= 7
 .arch	armv7-a
 .fpu	neon
 
 .globl	zfs_sha256_block_neon
 .type	zfs_sha256_block_neon,%function
 .align	5
 .skip	16
 zfs_sha256_block_neon:
 .LNEON:
 	stmdb	sp!,{r4-r12,lr}
 
 	sub	r11,sp,#16*4+16
 	adr	r14,K256
 	bic	r11,r11,#15		@ align for 128-bit stores
 	mov	r12,sp
 	mov	sp,r11			@ alloca
 	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
 
 	vld1.8		{q0},[r1]!
 	vld1.8		{q1},[r1]!
 	vld1.8		{q2},[r1]!
 	vld1.8		{q3},[r1]!
 	vld1.32		{q8},[r14,:128]!
 	vld1.32		{q9},[r14,:128]!
 	vld1.32		{q10},[r14,:128]!
 	vld1.32		{q11},[r14,:128]!
 	vrev32.8	q0,q0		@ yes, even on
 	str		r0,[sp,#64]
 	vrev32.8	q1,q1		@ big-endian
 	str		r1,[sp,#68]
 	mov		r1,sp
 	vrev32.8	q2,q2
 	str		r2,[sp,#72]
 	vrev32.8	q3,q3
 	str		r12,[sp,#76]		@ save original sp
 	vadd.i32	q8,q8,q0
 	vadd.i32	q9,q9,q1
 	vst1.32		{q8},[r1,:128]!
 	vadd.i32	q10,q10,q2
 	vst1.32		{q9},[r1,:128]!
 	vadd.i32	q11,q11,q3
 	vst1.32		{q10},[r1,:128]!
 	vst1.32		{q11},[r1,:128]!
 
 	ldmia		r0,{r4-r11}
 	sub		r1,r1,#64
 	ldr		r2,[sp,#0]
 	eor		r12,r12,r12
 	eor		r3,r5,r6
 	b		.L_00_48
 
 .align	4
 .L_00_48:
 	vext.8	q8,q0,q1,#4
 	add	r11,r11,r2
 	eor	r2,r9,r10
 	eor	r0,r8,r8,ror#5
 	vext.8	q9,q2,q3,#4
 	add	r4,r4,r12
 	and	r2,r2,r8
 	eor	r12,r0,r8,ror#19
 	vshr.u32	q10,q8,#7
 	eor	r0,r4,r4,ror#11
 	eor	r2,r2,r10
 	vadd.i32	q0,q0,q9
 	add	r11,r11,r12,ror#6
 	eor	r12,r4,r5
 	vshr.u32	q9,q8,#3
 	eor	r0,r0,r4,ror#20
 	add	r11,r11,r2
 	vsli.32	q10,q8,#25
 	ldr	r2,[sp,#4]
 	and	r3,r3,r12
 	vshr.u32	q11,q8,#18
 	add	r7,r7,r11
 	add	r11,r11,r0,ror#2
 	eor	r3,r3,r5
 	veor	q9,q9,q10
 	add	r10,r10,r2
 	vsli.32	q11,q8,#14
 	eor	r2,r8,r9
 	eor	r0,r7,r7,ror#5
 	vshr.u32	d24,d7,#17
 	add	r11,r11,r3
 	and	r2,r2,r7
 	veor	q9,q9,q11
 	eor	r3,r0,r7,ror#19
 	eor	r0,r11,r11,ror#11
 	vsli.32	d24,d7,#15
 	eor	r2,r2,r9
 	add	r10,r10,r3,ror#6
 	vshr.u32	d25,d7,#10
 	eor	r3,r11,r4
 	eor	r0,r0,r11,ror#20
 	vadd.i32	q0,q0,q9
 	add	r10,r10,r2
 	ldr	r2,[sp,#8]
 	veor	d25,d25,d24
 	and	r12,r12,r3
 	add	r6,r6,r10
 	vshr.u32	d24,d7,#19
 	add	r10,r10,r0,ror#2
 	eor	r12,r12,r4
 	vsli.32	d24,d7,#13
 	add	r9,r9,r2
 	eor	r2,r7,r8
 	veor	d25,d25,d24
 	eor	r0,r6,r6,ror#5
 	add	r10,r10,r12
 	vadd.i32	d0,d0,d25
 	and	r2,r2,r6
 	eor	r12,r0,r6,ror#19
 	vshr.u32	d24,d0,#17
 	eor	r0,r10,r10,ror#11
 	eor	r2,r2,r8
 	vsli.32	d24,d0,#15
 	add	r9,r9,r12,ror#6
 	eor	r12,r10,r11
 	vshr.u32	d25,d0,#10
 	eor	r0,r0,r10,ror#20
 	add	r9,r9,r2
 	veor	d25,d25,d24
 	ldr	r2,[sp,#12]
 	and	r3,r3,r12
 	vshr.u32	d24,d0,#19
 	add	r5,r5,r9
 	add	r9,r9,r0,ror#2
 	eor	r3,r3,r11
 	vld1.32	{q8},[r14,:128]!
 	add	r8,r8,r2
 	vsli.32	d24,d0,#13
 	eor	r2,r6,r7
 	eor	r0,r5,r5,ror#5
 	veor	d25,d25,d24
 	add	r9,r9,r3
 	and	r2,r2,r5
 	vadd.i32	d1,d1,d25
 	eor	r3,r0,r5,ror#19
 	eor	r0,r9,r9,ror#11
 	vadd.i32	q8,q8,q0
 	eor	r2,r2,r7
 	add	r8,r8,r3,ror#6
 	eor	r3,r9,r10
 	eor	r0,r0,r9,ror#20
 	add	r8,r8,r2
 	ldr	r2,[sp,#16]
 	and	r12,r12,r3
 	add	r4,r4,r8
 	vst1.32	{q8},[r1,:128]!
 	add	r8,r8,r0,ror#2
 	eor	r12,r12,r10
 	vext.8	q8,q1,q2,#4
 	add	r7,r7,r2
 	eor	r2,r5,r6
 	eor	r0,r4,r4,ror#5
 	vext.8	q9,q3,q0,#4
 	add	r8,r8,r12
 	and	r2,r2,r4
 	eor	r12,r0,r4,ror#19
 	vshr.u32	q10,q8,#7
 	eor	r0,r8,r8,ror#11
 	eor	r2,r2,r6
 	vadd.i32	q1,q1,q9
 	add	r7,r7,r12,ror#6
 	eor	r12,r8,r9
 	vshr.u32	q9,q8,#3
 	eor	r0,r0,r8,ror#20
 	add	r7,r7,r2
 	vsli.32	q10,q8,#25
 	ldr	r2,[sp,#20]
 	and	r3,r3,r12
 	vshr.u32	q11,q8,#18
 	add	r11,r11,r7
 	add	r7,r7,r0,ror#2
 	eor	r3,r3,r9
 	veor	q9,q9,q10
 	add	r6,r6,r2
 	vsli.32	q11,q8,#14
 	eor	r2,r4,r5
 	eor	r0,r11,r11,ror#5
 	vshr.u32	d24,d1,#17
 	add	r7,r7,r3
 	and	r2,r2,r11
 	veor	q9,q9,q11
 	eor	r3,r0,r11,ror#19
 	eor	r0,r7,r7,ror#11
 	vsli.32	d24,d1,#15
 	eor	r2,r2,r5
 	add	r6,r6,r3,ror#6
 	vshr.u32	d25,d1,#10
 	eor	r3,r7,r8
 	eor	r0,r0,r7,ror#20
 	vadd.i32	q1,q1,q9
 	add	r6,r6,r2
 	ldr	r2,[sp,#24]
 	veor	d25,d25,d24
 	and	r12,r12,r3
 	add	r10,r10,r6
 	vshr.u32	d24,d1,#19
 	add	r6,r6,r0,ror#2
 	eor	r12,r12,r8
 	vsli.32	d24,d1,#13
 	add	r5,r5,r2
 	eor	r2,r11,r4
 	veor	d25,d25,d24
 	eor	r0,r10,r10,ror#5
 	add	r6,r6,r12
 	vadd.i32	d2,d2,d25
 	and	r2,r2,r10
 	eor	r12,r0,r10,ror#19
 	vshr.u32	d24,d2,#17
 	eor	r0,r6,r6,ror#11
 	eor	r2,r2,r4
 	vsli.32	d24,d2,#15
 	add	r5,r5,r12,ror#6
 	eor	r12,r6,r7
 	vshr.u32	d25,d2,#10
 	eor	r0,r0,r6,ror#20
 	add	r5,r5,r2
 	veor	d25,d25,d24
 	ldr	r2,[sp,#28]
 	and	r3,r3,r12
 	vshr.u32	d24,d2,#19
 	add	r9,r9,r5
 	add	r5,r5,r0,ror#2
 	eor	r3,r3,r7
 	vld1.32	{q8},[r14,:128]!
 	add	r4,r4,r2
 	vsli.32	d24,d2,#13
 	eor	r2,r10,r11
 	eor	r0,r9,r9,ror#5
 	veor	d25,d25,d24
 	add	r5,r5,r3
 	and	r2,r2,r9
 	vadd.i32	d3,d3,d25
 	eor	r3,r0,r9,ror#19
 	eor	r0,r5,r5,ror#11
 	vadd.i32	q8,q8,q1
 	eor	r2,r2,r11
 	add	r4,r4,r3,ror#6
 	eor	r3,r5,r6
 	eor	r0,r0,r5,ror#20
 	add	r4,r4,r2
 	ldr	r2,[sp,#32]
 	and	r12,r12,r3
 	add	r8,r8,r4
 	vst1.32	{q8},[r1,:128]!
 	add	r4,r4,r0,ror#2
 	eor	r12,r12,r6
 	vext.8	q8,q2,q3,#4
 	add	r11,r11,r2
 	eor	r2,r9,r10
 	eor	r0,r8,r8,ror#5
 	vext.8	q9,q0,q1,#4
 	add	r4,r4,r12
 	and	r2,r2,r8
 	eor	r12,r0,r8,ror#19
 	vshr.u32	q10,q8,#7
 	eor	r0,r4,r4,ror#11
 	eor	r2,r2,r10
 	vadd.i32	q2,q2,q9
 	add	r11,r11,r12,ror#6
 	eor	r12,r4,r5
 	vshr.u32	q9,q8,#3
 	eor	r0,r0,r4,ror#20
 	add	r11,r11,r2
 	vsli.32	q10,q8,#25
 	ldr	r2,[sp,#36]
 	and	r3,r3,r12
 	vshr.u32	q11,q8,#18
 	add	r7,r7,r11
 	add	r11,r11,r0,ror#2
 	eor	r3,r3,r5
 	veor	q9,q9,q10
 	add	r10,r10,r2
 	vsli.32	q11,q8,#14
 	eor	r2,r8,r9
 	eor	r0,r7,r7,ror#5
 	vshr.u32	d24,d3,#17
 	add	r11,r11,r3
 	and	r2,r2,r7
 	veor	q9,q9,q11
 	eor	r3,r0,r7,ror#19
 	eor	r0,r11,r11,ror#11
 	vsli.32	d24,d3,#15
 	eor	r2,r2,r9
 	add	r10,r10,r3,ror#6
 	vshr.u32	d25,d3,#10
 	eor	r3,r11,r4
 	eor	r0,r0,r11,ror#20
 	vadd.i32	q2,q2,q9
 	add	r10,r10,r2
 	ldr	r2,[sp,#40]
 	veor	d25,d25,d24
 	and	r12,r12,r3
 	add	r6,r6,r10
 	vshr.u32	d24,d3,#19
 	add	r10,r10,r0,ror#2
 	eor	r12,r12,r4
 	vsli.32	d24,d3,#13
 	add	r9,r9,r2
 	eor	r2,r7,r8
 	veor	d25,d25,d24
 	eor	r0,r6,r6,ror#5
 	add	r10,r10,r12
 	vadd.i32	d4,d4,d25
 	and	r2,r2,r6
 	eor	r12,r0,r6,ror#19
 	vshr.u32	d24,d4,#17
 	eor	r0,r10,r10,ror#11
 	eor	r2,r2,r8
 	vsli.32	d24,d4,#15
 	add	r9,r9,r12,ror#6
 	eor	r12,r10,r11
 	vshr.u32	d25,d4,#10
 	eor	r0,r0,r10,ror#20
 	add	r9,r9,r2
 	veor	d25,d25,d24
 	ldr	r2,[sp,#44]
 	and	r3,r3,r12
 	vshr.u32	d24,d4,#19
 	add	r5,r5,r9
 	add	r9,r9,r0,ror#2
 	eor	r3,r3,r11
 	vld1.32	{q8},[r14,:128]!
 	add	r8,r8,r2
 	vsli.32	d24,d4,#13
 	eor	r2,r6,r7
 	eor	r0,r5,r5,ror#5
 	veor	d25,d25,d24
 	add	r9,r9,r3
 	and	r2,r2,r5
 	vadd.i32	d5,d5,d25
 	eor	r3,r0,r5,ror#19
 	eor	r0,r9,r9,ror#11
 	vadd.i32	q8,q8,q2
 	eor	r2,r2,r7
 	add	r8,r8,r3,ror#6
 	eor	r3,r9,r10
 	eor	r0,r0,r9,ror#20
 	add	r8,r8,r2
 	ldr	r2,[sp,#48]
 	and	r12,r12,r3
 	add	r4,r4,r8
 	vst1.32	{q8},[r1,:128]!
 	add	r8,r8,r0,ror#2
 	eor	r12,r12,r10
 	vext.8	q8,q3,q0,#4
 	add	r7,r7,r2
 	eor	r2,r5,r6
 	eor	r0,r4,r4,ror#5
 	vext.8	q9,q1,q2,#4
 	add	r8,r8,r12
 	and	r2,r2,r4
 	eor	r12,r0,r4,ror#19
 	vshr.u32	q10,q8,#7
 	eor	r0,r8,r8,ror#11
 	eor	r2,r2,r6
 	vadd.i32	q3,q3,q9
 	add	r7,r7,r12,ror#6
 	eor	r12,r8,r9
 	vshr.u32	q9,q8,#3
 	eor	r0,r0,r8,ror#20
 	add	r7,r7,r2
 	vsli.32	q10,q8,#25
 	ldr	r2,[sp,#52]
 	and	r3,r3,r12
 	vshr.u32	q11,q8,#18
 	add	r11,r11,r7
 	add	r7,r7,r0,ror#2
 	eor	r3,r3,r9
 	veor	q9,q9,q10
 	add	r6,r6,r2
 	vsli.32	q11,q8,#14
 	eor	r2,r4,r5
 	eor	r0,r11,r11,ror#5
 	vshr.u32	d24,d5,#17
 	add	r7,r7,r3
 	and	r2,r2,r11
 	veor	q9,q9,q11
 	eor	r3,r0,r11,ror#19
 	eor	r0,r7,r7,ror#11
 	vsli.32	d24,d5,#15
 	eor	r2,r2,r5
 	add	r6,r6,r3,ror#6
 	vshr.u32	d25,d5,#10
 	eor	r3,r7,r8
 	eor	r0,r0,r7,ror#20
 	vadd.i32	q3,q3,q9
 	add	r6,r6,r2
 	ldr	r2,[sp,#56]
 	veor	d25,d25,d24
 	and	r12,r12,r3
 	add	r10,r10,r6
 	vshr.u32	d24,d5,#19
 	add	r6,r6,r0,ror#2
 	eor	r12,r12,r8
 	vsli.32	d24,d5,#13
 	add	r5,r5,r2
 	eor	r2,r11,r4
 	veor	d25,d25,d24
 	eor	r0,r10,r10,ror#5
 	add	r6,r6,r12
 	vadd.i32	d6,d6,d25
 	and	r2,r2,r10
 	eor	r12,r0,r10,ror#19
 	vshr.u32	d24,d6,#17
 	eor	r0,r6,r6,ror#11
 	eor	r2,r2,r4
 	vsli.32	d24,d6,#15
 	add	r5,r5,r12,ror#6
 	eor	r12,r6,r7
 	vshr.u32	d25,d6,#10
 	eor	r0,r0,r6,ror#20
 	add	r5,r5,r2
 	veor	d25,d25,d24
 	ldr	r2,[sp,#60]
 	and	r3,r3,r12
 	vshr.u32	d24,d6,#19
 	add	r9,r9,r5
 	add	r5,r5,r0,ror#2
 	eor	r3,r3,r7
 	vld1.32	{q8},[r14,:128]!
 	add	r4,r4,r2
 	vsli.32	d24,d6,#13
 	eor	r2,r10,r11
 	eor	r0,r9,r9,ror#5
 	veor	d25,d25,d24
 	add	r5,r5,r3
 	and	r2,r2,r9
 	vadd.i32	d7,d7,d25
 	eor	r3,r0,r9,ror#19
 	eor	r0,r5,r5,ror#11
 	vadd.i32	q8,q8,q3
 	eor	r2,r2,r11
 	add	r4,r4,r3,ror#6
 	eor	r3,r5,r6
 	eor	r0,r0,r5,ror#20
 	add	r4,r4,r2
 	ldr	r2,[r14]
 	and	r12,r12,r3
 	add	r8,r8,r4
 	vst1.32	{q8},[r1,:128]!
 	add	r4,r4,r0,ror#2
 	eor	r12,r12,r6
 	teq	r2,#0				@ check for K256 terminator
 	ldr	r2,[sp,#0]
 	sub	r1,r1,#64
 	bne	.L_00_48
 
 	ldr		r1,[sp,#68]
 	ldr		r0,[sp,#72]
 	sub		r14,r14,#256	@ rewind r14
 	teq		r1,r0
 	it		eq
 	subeq		r1,r1,#64		@ avoid SEGV
 	vld1.8		{q0},[r1]!		@ load next input block
 	vld1.8		{q1},[r1]!
 	vld1.8		{q2},[r1]!
 	vld1.8		{q3},[r1]!
 	it		ne
 	strne		r1,[sp,#68]
 	mov		r1,sp
 	add	r11,r11,r2
 	eor	r2,r9,r10
 	eor	r0,r8,r8,ror#5
 	add	r4,r4,r12
 	vld1.32	{q8},[r14,:128]!
 	and	r2,r2,r8
 	eor	r12,r0,r8,ror#19
 	eor	r0,r4,r4,ror#11
 	eor	r2,r2,r10
 	vrev32.8	q0,q0
 	add	r11,r11,r12,ror#6
 	eor	r12,r4,r5
 	eor	r0,r0,r4,ror#20
 	add	r11,r11,r2
 	vadd.i32	q8,q8,q0
 	ldr	r2,[sp,#4]
 	and	r3,r3,r12
 	add	r7,r7,r11
 	add	r11,r11,r0,ror#2
 	eor	r3,r3,r5
 	add	r10,r10,r2
 	eor	r2,r8,r9
 	eor	r0,r7,r7,ror#5
 	add	r11,r11,r3
 	and	r2,r2,r7
 	eor	r3,r0,r7,ror#19
 	eor	r0,r11,r11,ror#11
 	eor	r2,r2,r9
 	add	r10,r10,r3,ror#6
 	eor	r3,r11,r4
 	eor	r0,r0,r11,ror#20
 	add	r10,r10,r2
 	ldr	r2,[sp,#8]
 	and	r12,r12,r3
 	add	r6,r6,r10
 	add	r10,r10,r0,ror#2
 	eor	r12,r12,r4
 	add	r9,r9,r2
 	eor	r2,r7,r8
 	eor	r0,r6,r6,ror#5
 	add	r10,r10,r12
 	and	r2,r2,r6
 	eor	r12,r0,r6,ror#19
 	eor	r0,r10,r10,ror#11
 	eor	r2,r2,r8
 	add	r9,r9,r12,ror#6
 	eor	r12,r10,r11
 	eor	r0,r0,r10,ror#20
 	add	r9,r9,r2
 	ldr	r2,[sp,#12]
 	and	r3,r3,r12
 	add	r5,r5,r9
 	add	r9,r9,r0,ror#2
 	eor	r3,r3,r11
 	add	r8,r8,r2
 	eor	r2,r6,r7
 	eor	r0,r5,r5,ror#5
 	add	r9,r9,r3
 	and	r2,r2,r5
 	eor	r3,r0,r5,ror#19
 	eor	r0,r9,r9,ror#11
 	eor	r2,r2,r7
 	add	r8,r8,r3,ror#6
 	eor	r3,r9,r10
 	eor	r0,r0,r9,ror#20
 	add	r8,r8,r2
 	ldr	r2,[sp,#16]
 	and	r12,r12,r3
 	add	r4,r4,r8
 	add	r8,r8,r0,ror#2
 	eor	r12,r12,r10
 	vst1.32	{q8},[r1,:128]!
 	add	r7,r7,r2
 	eor	r2,r5,r6
 	eor	r0,r4,r4,ror#5
 	add	r8,r8,r12
 	vld1.32	{q8},[r14,:128]!
 	and	r2,r2,r4
 	eor	r12,r0,r4,ror#19
 	eor	r0,r8,r8,ror#11
 	eor	r2,r2,r6
 	vrev32.8	q1,q1
 	add	r7,r7,r12,ror#6
 	eor	r12,r8,r9
 	eor	r0,r0,r8,ror#20
 	add	r7,r7,r2
 	vadd.i32	q8,q8,q1
 	ldr	r2,[sp,#20]
 	and	r3,r3,r12
 	add	r11,r11,r7
 	add	r7,r7,r0,ror#2
 	eor	r3,r3,r9
 	add	r6,r6,r2
 	eor	r2,r4,r5
 	eor	r0,r11,r11,ror#5
 	add	r7,r7,r3
 	and	r2,r2,r11
 	eor	r3,r0,r11,ror#19
 	eor	r0,r7,r7,ror#11
 	eor	r2,r2,r5
 	add	r6,r6,r3,ror#6
 	eor	r3,r7,r8
 	eor	r0,r0,r7,ror#20
 	add	r6,r6,r2
 	ldr	r2,[sp,#24]
 	and	r12,r12,r3
 	add	r10,r10,r6
 	add	r6,r6,r0,ror#2
 	eor	r12,r12,r8
 	add	r5,r5,r2
 	eor	r2,r11,r4
 	eor	r0,r10,r10,ror#5
 	add	r6,r6,r12
 	and	r2,r2,r10
 	eor	r12,r0,r10,ror#19
 	eor	r0,r6,r6,ror#11
 	eor	r2,r2,r4
 	add	r5,r5,r12,ror#6
 	eor	r12,r6,r7
 	eor	r0,r0,r6,ror#20
 	add	r5,r5,r2
 	ldr	r2,[sp,#28]
 	and	r3,r3,r12
 	add	r9,r9,r5
 	add	r5,r5,r0,ror#2
 	eor	r3,r3,r7
 	add	r4,r4,r2
 	eor	r2,r10,r11
 	eor	r0,r9,r9,ror#5
 	add	r5,r5,r3
 	and	r2,r2,r9
 	eor	r3,r0,r9,ror#19
 	eor	r0,r5,r5,ror#11
 	eor	r2,r2,r11
 	add	r4,r4,r3,ror#6
 	eor	r3,r5,r6
 	eor	r0,r0,r5,ror#20
 	add	r4,r4,r2
 	ldr	r2,[sp,#32]
 	and	r12,r12,r3
 	add	r8,r8,r4
 	add	r4,r4,r0,ror#2
 	eor	r12,r12,r6
 	vst1.32	{q8},[r1,:128]!
 	add	r11,r11,r2
 	eor	r2,r9,r10
 	eor	r0,r8,r8,ror#5
 	add	r4,r4,r12
 	vld1.32	{q8},[r14,:128]!
 	and	r2,r2,r8
 	eor	r12,r0,r8,ror#19
 	eor	r0,r4,r4,ror#11
 	eor	r2,r2,r10
 	vrev32.8	q2,q2
 	add	r11,r11,r12,ror#6
 	eor	r12,r4,r5
 	eor	r0,r0,r4,ror#20
 	add	r11,r11,r2
 	vadd.i32	q8,q8,q2
 	ldr	r2,[sp,#36]
 	and	r3,r3,r12
 	add	r7,r7,r11
 	add	r11,r11,r0,ror#2
 	eor	r3,r3,r5
 	add	r10,r10,r2
 	eor	r2,r8,r9
 	eor	r0,r7,r7,ror#5
 	add	r11,r11,r3
 	and	r2,r2,r7
 	eor	r3,r0,r7,ror#19
 	eor	r0,r11,r11,ror#11
 	eor	r2,r2,r9
 	add	r10,r10,r3,ror#6
 	eor	r3,r11,r4
 	eor	r0,r0,r11,ror#20
 	add	r10,r10,r2
 	ldr	r2,[sp,#40]
 	and	r12,r12,r3
 	add	r6,r6,r10
 	add	r10,r10,r0,ror#2
 	eor	r12,r12,r4
 	add	r9,r9,r2
 	eor	r2,r7,r8
 	eor	r0,r6,r6,ror#5
 	add	r10,r10,r12
 	and	r2,r2,r6
 	eor	r12,r0,r6,ror#19
 	eor	r0,r10,r10,ror#11
 	eor	r2,r2,r8
 	add	r9,r9,r12,ror#6
 	eor	r12,r10,r11
 	eor	r0,r0,r10,ror#20
 	add	r9,r9,r2
 	ldr	r2,[sp,#44]
 	and	r3,r3,r12
 	add	r5,r5,r9
 	add	r9,r9,r0,ror#2
 	eor	r3,r3,r11
 	add	r8,r8,r2
 	eor	r2,r6,r7
 	eor	r0,r5,r5,ror#5
 	add	r9,r9,r3
 	and	r2,r2,r5
 	eor	r3,r0,r5,ror#19
 	eor	r0,r9,r9,ror#11
 	eor	r2,r2,r7
 	add	r8,r8,r3,ror#6
 	eor	r3,r9,r10
 	eor	r0,r0,r9,ror#20
 	add	r8,r8,r2
 	ldr	r2,[sp,#48]
 	and	r12,r12,r3
 	add	r4,r4,r8
 	add	r8,r8,r0,ror#2
 	eor	r12,r12,r10
 	vst1.32	{q8},[r1,:128]!
 	add	r7,r7,r2
 	eor	r2,r5,r6
 	eor	r0,r4,r4,ror#5
 	add	r8,r8,r12
 	vld1.32	{q8},[r14,:128]!
 	and	r2,r2,r4
 	eor	r12,r0,r4,ror#19
 	eor	r0,r8,r8,ror#11
 	eor	r2,r2,r6
 	vrev32.8	q3,q3
 	add	r7,r7,r12,ror#6
 	eor	r12,r8,r9
 	eor	r0,r0,r8,ror#20
 	add	r7,r7,r2
 	vadd.i32	q8,q8,q3
 	ldr	r2,[sp,#52]
 	and	r3,r3,r12
 	add	r11,r11,r7
 	add	r7,r7,r0,ror#2
 	eor	r3,r3,r9
 	add	r6,r6,r2
 	eor	r2,r4,r5
 	eor	r0,r11,r11,ror#5
 	add	r7,r7,r3
 	and	r2,r2,r11
 	eor	r3,r0,r11,ror#19
 	eor	r0,r7,r7,ror#11
 	eor	r2,r2,r5
 	add	r6,r6,r3,ror#6
 	eor	r3,r7,r8
 	eor	r0,r0,r7,ror#20
 	add	r6,r6,r2
 	ldr	r2,[sp,#56]
 	and	r12,r12,r3
 	add	r10,r10,r6
 	add	r6,r6,r0,ror#2
 	eor	r12,r12,r8
 	add	r5,r5,r2
 	eor	r2,r11,r4
 	eor	r0,r10,r10,ror#5
 	add	r6,r6,r12
 	and	r2,r2,r10
 	eor	r12,r0,r10,ror#19
 	eor	r0,r6,r6,ror#11
 	eor	r2,r2,r4
 	add	r5,r5,r12,ror#6
 	eor	r12,r6,r7
 	eor	r0,r0,r6,ror#20
 	add	r5,r5,r2
 	ldr	r2,[sp,#60]
 	and	r3,r3,r12
 	add	r9,r9,r5
 	add	r5,r5,r0,ror#2
 	eor	r3,r3,r7
 	add	r4,r4,r2
 	eor	r2,r10,r11
 	eor	r0,r9,r9,ror#5
 	add	r5,r5,r3
 	and	r2,r2,r9
 	eor	r3,r0,r9,ror#19
 	eor	r0,r5,r5,ror#11
 	eor	r2,r2,r11
 	add	r4,r4,r3,ror#6
 	eor	r3,r5,r6
 	eor	r0,r0,r5,ror#20
 	add	r4,r4,r2
 	ldr	r2,[sp,#64]
 	and	r12,r12,r3
 	add	r8,r8,r4
 	add	r4,r4,r0,ror#2
 	eor	r12,r12,r6
 	vst1.32	{q8},[r1,:128]!
 	ldr	r0,[r2,#0]
 	add	r4,r4,r12			@ h+=Maj(a,b,c) from the past
 	ldr	r12,[r2,#4]
 	ldr	r3,[r2,#8]
 	ldr	r1,[r2,#12]
 	add	r4,r4,r0			@ accumulate
 	ldr	r0,[r2,#16]
 	add	r5,r5,r12
 	ldr	r12,[r2,#20]
 	add	r6,r6,r3
 	ldr	r3,[r2,#24]
 	add	r7,r7,r1
 	ldr	r1,[r2,#28]
 	add	r8,r8,r0
 	str	r4,[r2],#4
 	add	r9,r9,r12
 	str	r5,[r2],#4
 	add	r10,r10,r3
 	str	r6,[r2],#4
 	add	r11,r11,r1
 	str	r7,[r2],#4
 	stmia	r2,{r8-r11}
 
 	ittte	ne
 	movne	r1,sp
 	ldrne	r2,[sp,#0]
 	eorne	r12,r12,r12
 	ldreq	sp,[sp,#76]			@ restore original sp
 	itt	ne
 	eorne	r3,r5,r6
 	bne	.L_00_48
 
 	ldmia	sp!,{r4-r12,pc}
 .size	zfs_sha256_block_neon,.-zfs_sha256_block_neon
 
 /*
  * INST emits one 32-bit instruction as four raw bytes given by its
  * arguments (presumably so that the file still assembles when the
  * assembler does not know the mnemonic -- confirm).
  * In Thumb-2 builds the bytes are emitted in the order c, d|0xc, a, b;
  * otherwise they are emitted unchanged in the order a, b, c, d.
  */
 # if defined(__thumb2__)
 #  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
 # else
 #  define INST(a,b,c,d)	.byte	a,b,c,d
 # endif
 
 @
 @ zfs_sha256_block_armv8: SHA-256 block function built on the
 @ sha256h, sha256h2, sha256su0 and sha256su1 instructions, which are
 @ emitted as raw bytes through the INST macro.
 @
 @ Register usage, as read from the code below:
 @   r0  points to the eight 32-bit state words (loaded into q0,q1 on
 @       entry and stored back from q0,q1 on exit)
 @   r1  input pointer, advanced by 64 bytes per block
 @   r2  block count on entry; converted to the end-of-input address
 @   r3  pointer used to stream the round constants
 @
 @ NOTE(review): r3 is consumed on entry (r3 = r3 - 288) but is never
 @ initialised inside this function. Presumably the caller is expected
 @ to supply an address 256+32 bytes past the K256 table; confirm
 @ against the callers, because a plain three-argument call would leave
 @ r3 undefined.
 @
 .globl	zfs_sha256_block_armv8
 .type	zfs_sha256_block_armv8,%function
 .align	5
 zfs_sha256_block_armv8:
 .LARMv8:
 	vld1.32	{q0,q1},[r0]
 	sub	r3,r3,#256+32
 	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
 	b	.Loop_v8
 
 .align	4
 .Loop_v8:
 	@ Load one 64-byte input block into q8-q11 and the first 16 bytes
 	@ of round constants into q12.
 	vld1.8		{q8-q9},[r1]!
 	vld1.8		{q10-q11},[r1]!
 	vld1.32		{q12},[r3]!
 	@ Reverse the bytes inside every 32-bit word of the input block.
 	vrev32.8	q8,q8
 	vrev32.8	q9,q9
 	vrev32.8	q10,q10
 	vrev32.8	q11,q11
 	vmov		q14,q0	@ offload
 	vmov		q15,q1
 	@ The result of this compare is tested by the bne at the bottom
 	@ of the loop.
 	teq		r1,r2
 	@ Each group below adds four message words to four constants
 	@ (q12 or q13, used alternately), copies q0 to q2 as the third
 	@ operand of sha256h2, and issues sha256h/sha256h2; the
 	@ sha256su0/sha256su1 pairs update q8-q11 for later groups.
 	vld1.32		{q13},[r3]!
 	vadd.i32	q12,q12,q8
 	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
 	vmov		q2,q0
 	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
 	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
 	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
 	vld1.32		{q12},[r3]!
 	vadd.i32	q13,q13,q9
 	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
 	vmov		q2,q0
 	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
 	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
 	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
 	vld1.32		{q13},[r3]!
 	vadd.i32	q12,q12,q10
 	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
 	vmov		q2,q0
 	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
 	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
 	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
 	vld1.32		{q12},[r3]!
 	vadd.i32	q13,q13,q11
 	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
 	vmov		q2,q0
 	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
 	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
 	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
 	vld1.32		{q13},[r3]!
 	vadd.i32	q12,q12,q8
 	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
 	vmov		q2,q0
 	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
 	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
 	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
 	vld1.32		{q12},[r3]!
 	vadd.i32	q13,q13,q9
 	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
 	vmov		q2,q0
 	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
 	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
 	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
 	vld1.32		{q13},[r3]!
 	vadd.i32	q12,q12,q10
 	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
 	vmov		q2,q0
 	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
 	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
 	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
 	vld1.32		{q12},[r3]!
 	vadd.i32	q13,q13,q11
 	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
 	vmov		q2,q0
 	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
 	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
 	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
 	vld1.32		{q13},[r3]!
 	vadd.i32	q12,q12,q8
 	INST(0xe2,0x03,0xfa,0xf3)	@ sha256su0 q8,q9
 	vmov		q2,q0
 	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
 	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
 	INST(0xe6,0x0c,0x64,0xf3)	@ sha256su1 q8,q10,q11
 	vld1.32		{q12},[r3]!
 	vadd.i32	q13,q13,q9
 	INST(0xe4,0x23,0xfa,0xf3)	@ sha256su0 q9,q10
 	vmov		q2,q0
 	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
 	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
 	INST(0xe0,0x2c,0x66,0xf3)	@ sha256su1 q9,q11,q8
 	vld1.32		{q13},[r3]!
 	vadd.i32	q12,q12,q10
 	INST(0xe6,0x43,0xfa,0xf3)	@ sha256su0 q10,q11
 	vmov		q2,q0
 	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
 	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
 	INST(0xe2,0x4c,0x60,0xf3)	@ sha256su1 q10,q8,q9
 	vld1.32		{q12},[r3]!
 	vadd.i32	q13,q13,q11
 	INST(0xe0,0x63,0xfa,0xf3)	@ sha256su0 q11,q8
 	vmov		q2,q0
 	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
 	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
 	INST(0xe4,0x6c,0x62,0xf3)	@ sha256su1 q11,q9,q10
 	@ The last four groups no longer update q8-q11.
 	vld1.32		{q13},[r3]!
 	vadd.i32	q12,q12,q8
 	vmov		q2,q0
 	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
 	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
 
 	vld1.32		{q12},[r3]!
 	vadd.i32	q13,q13,q9
 	vmov		q2,q0
 	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
 	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
 
 	@ Final constant load has no writeback: the 15 post-incremented
 	@ 16-byte loads above advanced r3 by 240 = 256-16 bytes, which the
 	@ sub below undoes.
 	vld1.32		{q13},[r3]
 	vadd.i32	q12,q12,q10
 	sub		r3,r3,#256-16	@ rewind
 	vmov		q2,q0
 	INST(0x68,0x0c,0x02,0xf3)	@ sha256h q0,q1,q12
 	INST(0x68,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q12
 
 	vadd.i32	q13,q13,q11
 	vmov		q2,q0
 	INST(0x6a,0x0c,0x02,0xf3)	@ sha256h q0,q1,q13
 	INST(0x6a,0x2c,0x14,0xf3)	@ sha256h2 q1,q2,q13
 
 	@ Add the state saved in q14,q15 at the top of the loop.
 	vadd.i32	q0,q0,q14
 	vadd.i32	q1,q1,q15
 	it		ne
 	bne		.Loop_v8
 
 	vst1.32		{q0,q1},[r0]
 
 	bx	lr		@ bx lr
 .size	zfs_sha256_block_armv8,.-zfs_sha256_block_armv8
 
-#endif
+#endif // #if __ARM_ARCH__ >= 7
+#endif // #if defined(__arm__)
diff --git a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
index a4c804033b92..66d7dd3cf0f7 100644
--- a/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
+++ b/sys/contrib/openzfs/module/icp/asm-arm/sha2/sha512-armv7.S
@@ -1,1822 +1,1827 @@
 /*
  * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *     https://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 /*
  * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
  * - modified assembly to fit into OpenZFS
  */
 
 #if defined(__arm__)
 
-#define	__ARM_ARCH__      7
-#define	__ARM_MAX_ARCH__  7
+#ifndef __ARM_ARCH
+# define __ARM_ARCH__	7
+#else
+# define __ARM_ARCH__	__ARM_ARCH
+#endif
 
 /*
  * VFP_ABI_PUSH / VFP_ABI_POP save and restore d8-d15 on the stack.
  * When __KERNEL__ is defined both expand to nothing.
  */
 #ifndef __KERNEL__
 # define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
 # define VFP_ABI_POP	vldmia	sp!,{d8-d15}
 #else
 # define VFP_ABI_PUSH
 # define VFP_ABI_POP
 #endif
 
 /*
  * LO and HI are the byte offsets of the low and high 32-bit halves of
  * a 64-bit value in memory; WORD64 emits two 64-bit constants as four
  * 32-bit words in the matching order. With __ARMEL__ defined the low
  * half comes first, otherwise the high half comes first.
  */
 #ifdef __ARMEL__
 # define LO 0
 # define HI 4
 # define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
 #else
 # define HI 0
 # define LO 4
 # define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
 #endif
 
 /*
  * Select the instruction set: unified-syntax Thumb when __thumb2__ is
  * defined (adrl is then mapped onto plain adr), 32-bit ARM code
  * otherwise.
  */
 #if defined(__thumb2__)
 .syntax	unified
 .thumb
 # define adrl adr
 #else
 .code	32
 #endif
 
 .text
 
 @
 @ K512: the 80 64-bit SHA-512 round constants, two per line. Each
 @ constant is stored as a pair of 32-bit words whose order is chosen by
 @ the WORD64 macro. A zero word follows the table as a terminator; it
 @ lies outside the .size of K512.
 @
 .type	K512,%object
 .align	5
 K512:
 	WORD64(0x428a2f98,0xd728ae22,	0x71374491,0x23ef65cd)
 	WORD64(0xb5c0fbcf,0xec4d3b2f,	0xe9b5dba5,0x8189dbbc)
 	WORD64(0x3956c25b,0xf348b538,	0x59f111f1,0xb605d019)
 	WORD64(0x923f82a4,0xaf194f9b,	0xab1c5ed5,0xda6d8118)
 	WORD64(0xd807aa98,0xa3030242,	0x12835b01,0x45706fbe)
 	WORD64(0x243185be,0x4ee4b28c,	0x550c7dc3,0xd5ffb4e2)
 	WORD64(0x72be5d74,0xf27b896f,	0x80deb1fe,0x3b1696b1)
 	WORD64(0x9bdc06a7,0x25c71235,	0xc19bf174,0xcf692694)
 	WORD64(0xe49b69c1,0x9ef14ad2,	0xefbe4786,0x384f25e3)
 	WORD64(0x0fc19dc6,0x8b8cd5b5,	0x240ca1cc,0x77ac9c65)
 	WORD64(0x2de92c6f,0x592b0275,	0x4a7484aa,0x6ea6e483)
 	WORD64(0x5cb0a9dc,0xbd41fbd4,	0x76f988da,0x831153b5)
 	WORD64(0x983e5152,0xee66dfab,	0xa831c66d,0x2db43210)
 	WORD64(0xb00327c8,0x98fb213f,	0xbf597fc7,0xbeef0ee4)
 	WORD64(0xc6e00bf3,0x3da88fc2,	0xd5a79147,0x930aa725)
 	WORD64(0x06ca6351,0xe003826f,	0x14292967,0x0a0e6e70)
 	WORD64(0x27b70a85,0x46d22ffc,	0x2e1b2138,0x5c26c926)
 	WORD64(0x4d2c6dfc,0x5ac42aed,	0x53380d13,0x9d95b3df)
 	WORD64(0x650a7354,0x8baf63de,	0x766a0abb,0x3c77b2a8)
 	WORD64(0x81c2c92e,0x47edaee6,	0x92722c85,0x1482353b)
 	WORD64(0xa2bfe8a1,0x4cf10364,	0xa81a664b,0xbc423001)
 	WORD64(0xc24b8b70,0xd0f89791,	0xc76c51a3,0x0654be30)
 	WORD64(0xd192e819,0xd6ef5218,	0xd6990624,0x5565a910)
 	WORD64(0xf40e3585,0x5771202a,	0x106aa070,0x32bbd1b8)
 	WORD64(0x19a4c116,0xb8d2d0c8,	0x1e376c08,0x5141ab53)
 	WORD64(0x2748774c,0xdf8eeb99,	0x34b0bcb5,0xe19b48a8)
 	WORD64(0x391c0cb3,0xc5c95a63,	0x4ed8aa4a,0xe3418acb)
 	WORD64(0x5b9cca4f,0x7763e373,	0x682e6ff3,0xd6b2b8a3)
 	WORD64(0x748f82ee,0x5defb2fc,	0x78a5636f,0x43172f60)
 	WORD64(0x84c87814,0xa1f0ab72,	0x8cc70208,0x1a6439ec)
 	WORD64(0x90befffa,0x23631e28,	0xa4506ceb,0xde82bde9)
 	WORD64(0xbef9a3f7,0xb2c67915,	0xc67178f2,0xe372532b)
 	WORD64(0xca273ece,0xea26619c,	0xd186b8c7,0x21c0c207)
 	WORD64(0xeada7dd6,0xcde0eb1e,	0xf57d4f7f,0xee6ed178)
 	WORD64(0x06f067aa,0x72176fba,	0x0a637dc5,0xa2c898a6)
 	WORD64(0x113f9804,0xbef90dae,	0x1b710b35,0x131c471b)
 	WORD64(0x28db77f5,0x23047d84,	0x32caab7b,0x40c72493)
 	WORD64(0x3c9ebe0a,0x15c9bebc,	0x431d67c4,0x9c100d4c)
 	WORD64(0x4cc5d4be,0xcb3e42b6,	0x597f299c,0xfc657e2a)
 	WORD64(0x5fcb6fab,0x3ad6faec,	0x6c44198c,0x4a475817)
 .size	K512,.-K512
 .word	0				@ terminator
 
 @
 @ zfs_sha512_block_armv7: SHA-512 block function that uses only the
 @ core integer registers; every 64-bit quantity is handled as a
 @ low/high pair of 32-bit words.
 @
 @ Register usage on entry, as read from the code below:
 @   r0  pointer to the eight 64-bit state words a..h
 @   r1  input pointer, advanced by 8 bytes per message word
 @   r2  number of 128-byte blocks; converted to the end-of-input address
 @
 @ Inside the loop r5:r6 hold a (low:high), r7:r8 hold e, r3:r4
 @ accumulate T and r14 walks the K512 table. The remaining working
 @ variables live in a frame that slides down the stack by 8 bytes per
 @ round:
 @   sp+8 b, sp+16 c, sp+24 d, sp+40 f, sp+48 g, sp+56 h, sp+64 X[i]
 @
 .align	5
 .globl	zfs_sha512_block_armv7
 .type	zfs_sha512_block_armv7,%function
 zfs_sha512_block_armv7:
 .Lzfs_sha512_block_armv7:
 
 	@ r3 = address of this entry point, used to locate K512 below.
 #if __ARM_ARCH__<7 && !defined(__thumb2__)
 	sub	r3,pc,#8		@ zfs_sha512_block_armv7
 #else
 	adr	r3,.Lzfs_sha512_block_armv7
 #endif
 
 	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
 	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
 	@ K512 starts 672 bytes before the entry point: 640 bytes of
 	@ constants plus the terminator word, padded by the .align 5 in
 	@ front of this function.
 	sub	r14,r3,#672		@ K512
 	@ Reserve nine 64-bit slots for the working frame.
 	sub	sp,sp,#9*8
 
 	ldr	r7,[r0,#32+LO]
 	ldr	r8,[r0,#32+HI]
 	ldr	r9, [r0,#48+LO]
 	ldr	r10, [r0,#48+HI]
 	ldr	r11, [r0,#56+LO]
 	ldr	r12, [r0,#56+HI]
 .Loop:
 	@ Per-block setup: copy b, c, d, f, g, h from the state into the
 	@ frame; a stays in r5:r6 and e in r7:r8.
 	str	r9, [sp,#48+0]
 	str	r10, [sp,#48+4]
 	str	r11, [sp,#56+0]
 	str	r12, [sp,#56+4]
 	ldr	r5,[r0,#0+LO]
 	ldr	r6,[r0,#0+HI]
 	ldr	r3,[r0,#8+LO]
 	ldr	r4,[r0,#8+HI]
 	ldr	r9, [r0,#16+LO]
 	ldr	r10, [r0,#16+HI]
 	ldr	r11, [r0,#24+LO]
 	ldr	r12, [r0,#24+HI]
 	str	r3,[sp,#8+0]
 	str	r4,[sp,#8+4]
 	str	r9, [sp,#16+0]
 	str	r10, [sp,#16+4]
 	str	r11, [sp,#24+0]
 	str	r12, [sp,#24+4]
 	ldr	r3,[r0,#40+LO]
 	ldr	r4,[r0,#40+HI]
 	str	r3,[sp,#40+0]
 	str	r4,[sp,#40+4]
 
 	@ Rounds 0..15: the message word X[i] comes straight from the input
 	@ and is read as a big-endian 64-bit value into r3 (low) and r4
 	@ (high). The pre-ARMv7 path assembles it byte by byte; the other
 	@ path uses word loads plus a byte reversal on little-endian builds.
 .L00_15:
 #if __ARM_ARCH__<7
 	ldrb	r3,[r1,#7]
 	ldrb	r9, [r1,#6]
 	ldrb	r10, [r1,#5]
 	ldrb	r11, [r1,#4]
 	ldrb	r4,[r1,#3]
 	ldrb	r12, [r1,#2]
 	orr	r3,r3,r9,lsl#8
 	ldrb	r9, [r1,#1]
 	orr	r3,r3,r10,lsl#16
 	ldrb	r10, [r1],#8
 	orr	r3,r3,r11,lsl#24
 	orr	r4,r4,r12,lsl#8
 	orr	r4,r4,r9,lsl#16
 	orr	r4,r4,r10,lsl#24
 #else
 	ldr	r3,[r1,#4]
 	ldr	r4,[r1],#8
 #ifdef __ARMEL__
 	rev	r3,r3
 	rev	r4,r4
 #endif
 #endif
 	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
 	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 	mov	r9,r7,lsr#14
 	str	r3,[sp,#64+0]
 	mov	r10,r8,lsr#14
 	str	r4,[sp,#64+4]
 	eor	r9,r9,r8,lsl#18
 	ldr	r11,[sp,#56+0]	@ h.lo
 	eor	r10,r10,r7,lsl#18
 	ldr	r12,[sp,#56+4]	@ h.hi
 	eor	r9,r9,r7,lsr#18
 	eor	r10,r10,r8,lsr#18
 	eor	r9,r9,r8,lsl#14
 	eor	r10,r10,r7,lsl#14
 	eor	r9,r9,r8,lsr#9
 	eor	r10,r10,r7,lsr#9
 	eor	r9,r9,r7,lsl#23
 	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
 	adds	r3,r3,r9
 	ldr	r9,[sp,#40+0]	@ f.lo
 	adc	r4,r4,r10		@ T += Sigma1(e)
 	ldr	r10,[sp,#40+4]	@ f.hi
 	adds	r3,r3,r11
 	ldr	r11,[sp,#48+0]	@ g.lo
 	adc	r4,r4,r12		@ T += h
 	ldr	r12,[sp,#48+4]	@ g.hi
 
 	eor	r9,r9,r11
 	str	r7,[sp,#32+0]
 	eor	r10,r10,r12
 	str	r8,[sp,#32+4]
 	and	r9,r9,r7
 	str	r5,[sp,#0+0]
 	and	r10,r10,r8
 	str	r6,[sp,#0+4]
 	eor	r9,r9,r11
 	ldr	r11,[r14,#LO]	@ K[i].lo
 	eor	r10,r10,r12		@ Ch(e,f,g)
 	ldr	r12,[r14,#HI]	@ K[i].hi
 
 	adds	r3,r3,r9
 	ldr	r7,[sp,#24+0]	@ d.lo
 	adc	r4,r4,r10		@ T += Ch(e,f,g)
 	ldr	r8,[sp,#24+4]	@ d.hi
 	adds	r3,r3,r11
 	and	r9,r11,#0xff
 	adc	r4,r4,r12		@ T += K[i]
 	adds	r7,r7,r3
 	ldr	r11,[sp,#8+0]	@ b.lo
 	adc	r8,r8,r4		@ d += T
 	@ 148 = 0x94 is the low byte of the 16th constant in K512
 	@ (0x...cf692694). When it is seen, bit 0 of r14 is set as the
 	@ flag that round 15 is being processed.
 	teq	r9,#148
 
 	ldr	r12,[sp,#16+0]	@ c.lo
 #ifdef	__thumb2__
 	it	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	orreq	r14,r14,#1
 	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
 	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
 	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
 	mov	r9,r5,lsr#28
 	mov	r10,r6,lsr#28
 	eor	r9,r9,r6,lsl#4
 	eor	r10,r10,r5,lsl#4
 	eor	r9,r9,r6,lsr#2
 	eor	r10,r10,r5,lsr#2
 	eor	r9,r9,r5,lsl#30
 	eor	r10,r10,r6,lsl#30
 	eor	r9,r9,r6,lsr#7
 	eor	r10,r10,r5,lsr#7
 	eor	r9,r9,r5,lsl#25
 	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
 	adds	r3,r3,r9
 	and	r9,r5,r11
 	adc	r4,r4,r10		@ T += Sigma0(a)
 
 	ldr	r10,[sp,#8+4]	@ b.hi
 	orr	r5,r5,r11
 	ldr	r11,[sp,#16+4]	@ c.hi
 	and	r5,r5,r12
 	and	r12,r6,r10
 	orr	r6,r6,r10
 	orr	r5,r5,r9		@ Maj(a,b,c).lo
 	and	r6,r6,r11
 	adds	r5,r5,r3
 	orr	r6,r6,r12		@ Maj(a,b,c).hi
 	@ Slide the frame down by one 64-bit slot, so the values stored
 	@ at sp+0 (a) and sp+32 (e) this round are found at sp+8 (b) and
 	@ sp+40 (f) in the next round.
 	sub	sp,sp,#8
 	adc	r6,r6,r4		@ h += T
 	@ Advance to the next constant and loop until the flag in bit 0
 	@ of r14 is set. The add does not alter the flags, so the second
 	@ tst simply repeats the first.
 	tst	r14,#1
 	add	r14,r14,#8
 	tst	r14,#1
 	beq	.L00_15
 	@ Preload X[i-15] for the first round of .L16_79 and clear the flag.
 	ldr	r9,[sp,#184+0]
 	ldr	r10,[sp,#184+4]
 	bic	r14,r14,#1
 	@ Rounds 16..79: X[i] is derived from earlier message words kept
 	@ on the stack. Because every round stores X at sp+64 and then
 	@ lowers sp by 8, the operands are found at sp+184 (X[i-15], already
 	@ in r9:r10), sp+80 (X[i-2]), sp+120 (X[i-7]) and sp+192 (X[i-16]).
 .L16_79:
 	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
 	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
 	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
 	mov	r3,r9,lsr#1
 	ldr	r11,[sp,#80+0]
 	mov	r4,r10,lsr#1
 	ldr	r12,[sp,#80+4]
 	eor	r3,r3,r10,lsl#31
 	eor	r4,r4,r9,lsl#31
 	eor	r3,r3,r9,lsr#8
 	eor	r4,r4,r10,lsr#8
 	eor	r3,r3,r10,lsl#24
 	eor	r4,r4,r9,lsl#24
 	eor	r3,r3,r9,lsr#7
 	eor	r4,r4,r10,lsr#7
 	eor	r3,r3,r10,lsl#25
 
 	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
 	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
 	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
 	mov	r9,r11,lsr#19
 	mov	r10,r12,lsr#19
 	eor	r9,r9,r12,lsl#13
 	eor	r10,r10,r11,lsl#13
 	eor	r9,r9,r12,lsr#29
 	eor	r10,r10,r11,lsr#29
 	eor	r9,r9,r11,lsl#3
 	eor	r10,r10,r12,lsl#3
 	eor	r9,r9,r11,lsr#6
 	eor	r10,r10,r12,lsr#6
 	ldr	r11,[sp,#120+0]
 	eor	r9,r9,r12,lsl#26
 
 	ldr	r12,[sp,#120+4]
 	adds	r3,r3,r9
 	ldr	r9,[sp,#192+0]
 	adc	r4,r4,r10
 
 	ldr	r10,[sp,#192+4]
 	adds	r3,r3,r11
 	adc	r4,r4,r12
 	adds	r3,r3,r9
 	adc	r4,r4,r10
 	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
 	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
 	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
 	mov	r9,r7,lsr#14
 	str	r3,[sp,#64+0]
 	mov	r10,r8,lsr#14
 	str	r4,[sp,#64+4]
 	eor	r9,r9,r8,lsl#18
 	ldr	r11,[sp,#56+0]	@ h.lo
 	eor	r10,r10,r7,lsl#18
 	ldr	r12,[sp,#56+4]	@ h.hi
 	eor	r9,r9,r7,lsr#18
 	eor	r10,r10,r8,lsr#18
 	eor	r9,r9,r8,lsl#14
 	eor	r10,r10,r7,lsl#14
 	eor	r9,r9,r8,lsr#9
 	eor	r10,r10,r7,lsr#9
 	eor	r9,r9,r7,lsl#23
 	eor	r10,r10,r8,lsl#23	@ Sigma1(e)
 	adds	r3,r3,r9
 	ldr	r9,[sp,#40+0]	@ f.lo
 	adc	r4,r4,r10		@ T += Sigma1(e)
 	ldr	r10,[sp,#40+4]	@ f.hi
 	adds	r3,r3,r11
 	ldr	r11,[sp,#48+0]	@ g.lo
 	adc	r4,r4,r12		@ T += h
 	ldr	r12,[sp,#48+4]	@ g.hi
 
 	eor	r9,r9,r11
 	str	r7,[sp,#32+0]
 	eor	r10,r10,r12
 	str	r8,[sp,#32+4]
 	and	r9,r9,r7
 	str	r5,[sp,#0+0]
 	and	r10,r10,r8
 	str	r6,[sp,#0+4]
 	eor	r9,r9,r11
 	ldr	r11,[r14,#LO]	@ K[i].lo
 	eor	r10,r10,r12		@ Ch(e,f,g)
 	ldr	r12,[r14,#HI]	@ K[i].hi
 
 	adds	r3,r3,r9
 	ldr	r7,[sp,#24+0]	@ d.lo
 	adc	r4,r4,r10		@ T += Ch(e,f,g)
 	ldr	r8,[sp,#24+4]	@ d.hi
 	adds	r3,r3,r11
 	and	r9,r11,#0xff
 	adc	r4,r4,r12		@ T += K[i]
 	adds	r7,r7,r3
 	ldr	r11,[sp,#8+0]	@ b.lo
 	adc	r8,r8,r4		@ d += T
 	@ 23 = 0x17 is the low byte of the last constant in K512
 	@ (0x...4a475817); seeing it sets the end-of-rounds flag in bit 0
 	@ of r14.
 	teq	r9,#23
 
 	ldr	r12,[sp,#16+0]	@ c.lo
 #ifdef	__thumb2__
 	it	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	orreq	r14,r14,#1
 	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
 	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
 	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
 	mov	r9,r5,lsr#28
 	mov	r10,r6,lsr#28
 	eor	r9,r9,r6,lsl#4
 	eor	r10,r10,r5,lsl#4
 	eor	r9,r9,r6,lsr#2
 	eor	r10,r10,r5,lsr#2
 	eor	r9,r9,r5,lsl#30
 	eor	r10,r10,r6,lsl#30
 	eor	r9,r9,r6,lsr#7
 	eor	r10,r10,r5,lsr#7
 	eor	r9,r9,r5,lsl#25
 	eor	r10,r10,r6,lsl#25	@ Sigma0(a)
 	adds	r3,r3,r9
 	and	r9,r5,r11
 	adc	r4,r4,r10		@ T += Sigma0(a)
 
 	ldr	r10,[sp,#8+4]	@ b.hi
 	orr	r5,r5,r11
 	ldr	r11,[sp,#16+4]	@ c.hi
 	and	r5,r5,r12
 	and	r12,r6,r10
 	orr	r6,r6,r10
 	orr	r5,r5,r9		@ Maj(a,b,c).lo
 	and	r6,r6,r11
 	adds	r5,r5,r3
 	orr	r6,r6,r12		@ Maj(a,b,c).hi
 	sub	sp,sp,#8
 	adc	r6,r6,r4		@ h += T
 	tst	r14,#1
 	add	r14,r14,#8
 	@ While the flag is clear, preload X[i-15] for the next round and
 	@ loop again.
 #ifdef	__thumb2__
 	ittt	eq			@ Thumb2 thing, sanity check in ARM
 #endif
 	ldreq	r9,[sp,#184+0]
 	ldreq	r10,[sp,#184+4]
 	beq	.L16_79
 	bic	r14,r14,#1
 
 	@ All 80 rounds done: add the working variables to the state at
 	@ r0, 64 bits at a time (adds for the low word, adc for the high).
 	ldr	r3,[sp,#8+0]
 	ldr	r4,[sp,#8+4]
 	ldr	r9, [r0,#0+LO]
 	ldr	r10, [r0,#0+HI]
 	ldr	r11, [r0,#8+LO]
 	ldr	r12, [r0,#8+HI]
 	adds	r9,r5,r9
 	str	r9, [r0,#0+LO]
 	adc	r10,r6,r10
 	str	r10, [r0,#0+HI]
 	adds	r11,r3,r11
 	str	r11, [r0,#8+LO]
 	adc	r12,r4,r12
 	str	r12, [r0,#8+HI]
 
 	ldr	r5,[sp,#16+0]
 	ldr	r6,[sp,#16+4]
 	ldr	r3,[sp,#24+0]
 	ldr	r4,[sp,#24+4]
 	ldr	r9, [r0,#16+LO]
 	ldr	r10, [r0,#16+HI]
 	ldr	r11, [r0,#24+LO]
 	ldr	r12, [r0,#24+HI]
 	adds	r9,r5,r9
 	str	r9, [r0,#16+LO]
 	adc	r10,r6,r10
 	str	r10, [r0,#16+HI]
 	adds	r11,r3,r11
 	str	r11, [r0,#24+LO]
 	adc	r12,r4,r12
 	str	r12, [r0,#24+HI]
 
 	ldr	r3,[sp,#40+0]
 	ldr	r4,[sp,#40+4]
 	ldr	r9, [r0,#32+LO]
 	ldr	r10, [r0,#32+HI]
 	ldr	r11, [r0,#40+LO]
 	ldr	r12, [r0,#40+HI]
 	adds	r7,r7,r9
 	str	r7,[r0,#32+LO]
 	adc	r8,r8,r10
 	str	r8,[r0,#32+HI]
 	adds	r11,r3,r11
 	str	r11, [r0,#40+LO]
 	adc	r12,r4,r12
 	str	r12, [r0,#40+HI]
 
 	ldr	r5,[sp,#48+0]
 	ldr	r6,[sp,#48+4]
 	ldr	r3,[sp,#56+0]
 	ldr	r4,[sp,#56+4]
 	ldr	r9, [r0,#48+LO]
 	ldr	r10, [r0,#48+HI]
 	ldr	r11, [r0,#56+LO]
 	ldr	r12, [r0,#56+HI]
 	adds	r9,r5,r9
 	str	r9, [r0,#48+LO]
 	adc	r10,r6,r10
 	str	r10, [r0,#48+HI]
 	adds	r11,r3,r11
 	str	r11, [r0,#56+LO]
 	adc	r12,r4,r12
 	str	r12, [r0,#56+HI]
 
 	@ Undo the 80 per-round 8-byte steps of sp and of the K512 pointer.
 	add	sp,sp,#640
 	sub	r14,r14,#640
 
 	@ Continue with the next block until the end of the input.
 	teq	r1,r2
 	bne	.Loop
 
 	add	sp,sp,#8*9		@ destroy frame
 
 #if __ARM_ARCH__>=5
 	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc}
 #else
 	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
 	tst	lr,#1
 	moveq	pc,lr			@ be binary compatible with V4, yet
 .word	0xe12fff1e			@ interoperable with Thumb ISA:-)
 #endif
 .size	zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
 
+#if __ARM_ARCH__ >= 7
 .arch	armv7-a
 .fpu	neon
 
 .globl	zfs_sha512_block_neon
 .type	zfs_sha512_block_neon,%function
 .align	4
 zfs_sha512_block_neon:
 .LNEON:
 	dmb	@ errata #451034 on early Cortex A8
 	add	r2,r1,r2,lsl#7	@ len to point at the end of inp
 	adr	r3,K512
 	VFP_ABI_PUSH
 	vldmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}		@ load context
 .Loop_neon:
 	vshr.u64	d24,d20,#14	@ 0
 #if 0<16
 	vld1.64	{d0},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d20,#18
 #if 0>0
 	vadd.i64	d16,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d20,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d20,#50
 	vsli.64	d25,d20,#46
 	vmov	d29,d20
 	vsli.64	d26,d20,#23
 #if 0<16 && defined(__ARMEL__)
 	vrev64.8	d0,d0
 #endif
 	veor	d25,d24
 	vbsl	d29,d21,d22		@ Ch(e,f,g)
 	vshr.u64	d24,d16,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d23
 	vshr.u64	d25,d16,#34
 	vsli.64	d24,d16,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d16,#39
 	vadd.i64	d28,d0
 	vsli.64	d25,d16,#30
 	veor	d30,d16,d17
 	vsli.64	d26,d16,#25
 	veor	d23,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d18,d17		@ Maj(a,b,c)
 	veor	d23,d26			@ Sigma0(a)
 	vadd.i64	d19,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d23,d30
 	vshr.u64	d24,d19,#14	@ 1
 #if 1<16
 	vld1.64	{d1},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d19,#18
 #if 1>0
 	vadd.i64	d23,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d19,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d19,#50
 	vsli.64	d25,d19,#46
 	vmov	d29,d19
 	vsli.64	d26,d19,#23
 #if 1<16 && defined(__ARMEL__)
 	vrev64.8	d1,d1
 #endif
 	veor	d25,d24
 	vbsl	d29,d20,d21		@ Ch(e,f,g)
 	vshr.u64	d24,d23,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d22
 	vshr.u64	d25,d23,#34
 	vsli.64	d24,d23,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d23,#39
 	vadd.i64	d28,d1
 	vsli.64	d25,d23,#30
 	veor	d30,d23,d16
 	vsli.64	d26,d23,#25
 	veor	d22,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d17,d16		@ Maj(a,b,c)
 	veor	d22,d26			@ Sigma0(a)
 	vadd.i64	d18,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d22,d30
 	vshr.u64	d24,d18,#14	@ 2
 #if 2<16
 	vld1.64	{d2},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d18,#18
 #if 2>0
 	vadd.i64	d22,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d18,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d18,#50
 	vsli.64	d25,d18,#46
 	vmov	d29,d18
 	vsli.64	d26,d18,#23
 #if 2<16 && defined(__ARMEL__)
 	vrev64.8	d2,d2
 #endif
 	veor	d25,d24
 	vbsl	d29,d19,d20		@ Ch(e,f,g)
 	vshr.u64	d24,d22,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d21
 	vshr.u64	d25,d22,#34
 	vsli.64	d24,d22,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d22,#39
 	vadd.i64	d28,d2
 	vsli.64	d25,d22,#30
 	veor	d30,d22,d23
 	vsli.64	d26,d22,#25
 	veor	d21,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d16,d23		@ Maj(a,b,c)
 	veor	d21,d26			@ Sigma0(a)
 	vadd.i64	d17,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d21,d30
 	vshr.u64	d24,d17,#14	@ 3
 #if 3<16
 	vld1.64	{d3},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d17,#18
 #if 3>0
 	vadd.i64	d21,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d17,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d17,#50
 	vsli.64	d25,d17,#46
 	vmov	d29,d17
 	vsli.64	d26,d17,#23
 #if 3<16 && defined(__ARMEL__)
 	vrev64.8	d3,d3
 #endif
 	veor	d25,d24
 	vbsl	d29,d18,d19		@ Ch(e,f,g)
 	vshr.u64	d24,d21,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d20
 	vshr.u64	d25,d21,#34
 	vsli.64	d24,d21,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d21,#39
 	vadd.i64	d28,d3
 	vsli.64	d25,d21,#30
 	veor	d30,d21,d22
 	vsli.64	d26,d21,#25
 	veor	d20,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d23,d22		@ Maj(a,b,c)
 	veor	d20,d26			@ Sigma0(a)
 	vadd.i64	d16,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d20,d30
 	vshr.u64	d24,d16,#14	@ 4
 #if 4<16
 	vld1.64	{d4},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d16,#18
 #if 4>0
 	vadd.i64	d20,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d16,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d16,#50
 	vsli.64	d25,d16,#46
 	vmov	d29,d16
 	vsli.64	d26,d16,#23
 #if 4<16 && defined(__ARMEL__)
 	vrev64.8	d4,d4
 #endif
 	veor	d25,d24
 	vbsl	d29,d17,d18		@ Ch(e,f,g)
 	vshr.u64	d24,d20,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d19
 	vshr.u64	d25,d20,#34
 	vsli.64	d24,d20,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d20,#39
 	vadd.i64	d28,d4
 	vsli.64	d25,d20,#30
 	veor	d30,d20,d21
 	vsli.64	d26,d20,#25
 	veor	d19,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d22,d21		@ Maj(a,b,c)
 	veor	d19,d26			@ Sigma0(a)
 	vadd.i64	d23,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d19,d30
 	vshr.u64	d24,d23,#14	@ 5
 #if 5<16
 	vld1.64	{d5},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d23,#18
 #if 5>0
 	vadd.i64	d19,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d23,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d23,#50
 	vsli.64	d25,d23,#46
 	vmov	d29,d23
 	vsli.64	d26,d23,#23
 #if 5<16 && defined(__ARMEL__)
 	vrev64.8	d5,d5
 #endif
 	veor	d25,d24
 	vbsl	d29,d16,d17		@ Ch(e,f,g)
 	vshr.u64	d24,d19,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d18
 	vshr.u64	d25,d19,#34
 	vsli.64	d24,d19,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d19,#39
 	vadd.i64	d28,d5
 	vsli.64	d25,d19,#30
 	veor	d30,d19,d20
 	vsli.64	d26,d19,#25
 	veor	d18,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d21,d20		@ Maj(a,b,c)
 	veor	d18,d26			@ Sigma0(a)
 	vadd.i64	d22,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d18,d30
 	vshr.u64	d24,d22,#14	@ 6
 #if 6<16
 	vld1.64	{d6},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d22,#18
 #if 6>0
 	vadd.i64	d18,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d22,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d22,#50
 	vsli.64	d25,d22,#46
 	vmov	d29,d22
 	vsli.64	d26,d22,#23
 #if 6<16 && defined(__ARMEL__)
 	vrev64.8	d6,d6
 #endif
 	veor	d25,d24
 	vbsl	d29,d23,d16		@ Ch(e,f,g)
 	vshr.u64	d24,d18,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d17
 	vshr.u64	d25,d18,#34
 	vsli.64	d24,d18,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d18,#39
 	vadd.i64	d28,d6
 	vsli.64	d25,d18,#30
 	veor	d30,d18,d19
 	vsli.64	d26,d18,#25
 	veor	d17,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d20,d19		@ Maj(a,b,c)
 	veor	d17,d26			@ Sigma0(a)
 	vadd.i64	d21,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d17,d30
 	vshr.u64	d24,d21,#14	@ 7
 #if 7<16
 	vld1.64	{d7},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d21,#18
 #if 7>0
 	vadd.i64	d17,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d21,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d21,#50
 	vsli.64	d25,d21,#46
 	vmov	d29,d21
 	vsli.64	d26,d21,#23
 #if 7<16 && defined(__ARMEL__)
 	vrev64.8	d7,d7
 #endif
 	veor	d25,d24
 	vbsl	d29,d22,d23		@ Ch(e,f,g)
 	vshr.u64	d24,d17,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d16
 	vshr.u64	d25,d17,#34
 	vsli.64	d24,d17,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d17,#39
 	vadd.i64	d28,d7
 	vsli.64	d25,d17,#30
 	veor	d30,d17,d18
 	vsli.64	d26,d17,#25
 	veor	d16,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d19,d18		@ Maj(a,b,c)
 	veor	d16,d26			@ Sigma0(a)
 	vadd.i64	d20,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d16,d30
 	vshr.u64	d24,d20,#14	@ 8
 #if 8<16
 	vld1.64	{d8},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d20,#18
 #if 8>0
 	vadd.i64	d16,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d20,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d20,#50
 	vsli.64	d25,d20,#46
 	vmov	d29,d20
 	vsli.64	d26,d20,#23
 #if 8<16 && defined(__ARMEL__)
 	vrev64.8	d8,d8
 #endif
 	veor	d25,d24
 	vbsl	d29,d21,d22		@ Ch(e,f,g)
 	vshr.u64	d24,d16,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d23
 	vshr.u64	d25,d16,#34
 	vsli.64	d24,d16,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d16,#39
 	vadd.i64	d28,d8
 	vsli.64	d25,d16,#30
 	veor	d30,d16,d17
 	vsli.64	d26,d16,#25
 	veor	d23,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d18,d17		@ Maj(a,b,c)
 	veor	d23,d26			@ Sigma0(a)
 	vadd.i64	d19,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d23,d30
 	vshr.u64	d24,d19,#14	@ 9
 #if 9<16
 	vld1.64	{d9},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d19,#18
 #if 9>0
 	vadd.i64	d23,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d19,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d19,#50
 	vsli.64	d25,d19,#46
 	vmov	d29,d19
 	vsli.64	d26,d19,#23
 #if 9<16 && defined(__ARMEL__)
 	vrev64.8	d9,d9
 #endif
 	veor	d25,d24
 	vbsl	d29,d20,d21		@ Ch(e,f,g)
 	vshr.u64	d24,d23,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d22
 	vshr.u64	d25,d23,#34
 	vsli.64	d24,d23,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d23,#39
 	vadd.i64	d28,d9
 	vsli.64	d25,d23,#30
 	veor	d30,d23,d16
 	vsli.64	d26,d23,#25
 	veor	d22,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d17,d16		@ Maj(a,b,c)
 	veor	d22,d26			@ Sigma0(a)
 	vadd.i64	d18,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d22,d30
 	vshr.u64	d24,d18,#14	@ 10
 #if 10<16
 	vld1.64	{d10},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d18,#18
 #if 10>0
 	vadd.i64	d22,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d18,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d18,#50
 	vsli.64	d25,d18,#46
 	vmov	d29,d18
 	vsli.64	d26,d18,#23
 #if 10<16 && defined(__ARMEL__)
 	vrev64.8	d10,d10
 #endif
 	veor	d25,d24
 	vbsl	d29,d19,d20		@ Ch(e,f,g)
 	vshr.u64	d24,d22,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d21
 	vshr.u64	d25,d22,#34
 	vsli.64	d24,d22,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d22,#39
 	vadd.i64	d28,d10
 	vsli.64	d25,d22,#30
 	veor	d30,d22,d23
 	vsli.64	d26,d22,#25
 	veor	d21,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d16,d23		@ Maj(a,b,c)
 	veor	d21,d26			@ Sigma0(a)
 	vadd.i64	d17,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d21,d30
 	vshr.u64	d24,d17,#14	@ 11
 #if 11<16
 	vld1.64	{d11},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d17,#18
 #if 11>0
 	vadd.i64	d21,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d17,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d17,#50
 	vsli.64	d25,d17,#46
 	vmov	d29,d17
 	vsli.64	d26,d17,#23
 #if 11<16 && defined(__ARMEL__)
 	vrev64.8	d11,d11
 #endif
 	veor	d25,d24
 	vbsl	d29,d18,d19		@ Ch(e,f,g)
 	vshr.u64	d24,d21,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d20
 	vshr.u64	d25,d21,#34
 	vsli.64	d24,d21,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d21,#39
 	vadd.i64	d28,d11
 	vsli.64	d25,d21,#30
 	veor	d30,d21,d22
 	vsli.64	d26,d21,#25
 	veor	d20,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d23,d22		@ Maj(a,b,c)
 	veor	d20,d26			@ Sigma0(a)
 	vadd.i64	d16,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d20,d30
 	vshr.u64	d24,d16,#14	@ 12
 #if 12<16
 	vld1.64	{d12},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d16,#18
 #if 12>0
 	vadd.i64	d20,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d16,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d16,#50
 	vsli.64	d25,d16,#46
 	vmov	d29,d16
 	vsli.64	d26,d16,#23
 #if 12<16 && defined(__ARMEL__)
 	vrev64.8	d12,d12
 #endif
 	veor	d25,d24
 	vbsl	d29,d17,d18		@ Ch(e,f,g)
 	vshr.u64	d24,d20,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d19
 	vshr.u64	d25,d20,#34
 	vsli.64	d24,d20,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d20,#39
 	vadd.i64	d28,d12
 	vsli.64	d25,d20,#30
 	veor	d30,d20,d21
 	vsli.64	d26,d20,#25
 	veor	d19,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d22,d21		@ Maj(a,b,c)
 	veor	d19,d26			@ Sigma0(a)
 	vadd.i64	d23,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d19,d30
 	vshr.u64	d24,d23,#14	@ 13
 #if 13<16
 	vld1.64	{d13},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d23,#18
 #if 13>0
 	vadd.i64	d19,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d23,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d23,#50
 	vsli.64	d25,d23,#46
 	vmov	d29,d23
 	vsli.64	d26,d23,#23
 #if 13<16 && defined(__ARMEL__)
 	vrev64.8	d13,d13
 #endif
 	veor	d25,d24
 	vbsl	d29,d16,d17		@ Ch(e,f,g)
 	vshr.u64	d24,d19,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d18
 	vshr.u64	d25,d19,#34
 	vsli.64	d24,d19,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d19,#39
 	vadd.i64	d28,d13
 	vsli.64	d25,d19,#30
 	veor	d30,d19,d20
 	vsli.64	d26,d19,#25
 	veor	d18,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d21,d20		@ Maj(a,b,c)
 	veor	d18,d26			@ Sigma0(a)
 	vadd.i64	d22,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d18,d30
 	vshr.u64	d24,d22,#14	@ 14
 #if 14<16
 	vld1.64	{d14},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d22,#18
 #if 14>0
 	vadd.i64	d18,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d22,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d22,#50
 	vsli.64	d25,d22,#46
 	vmov	d29,d22
 	vsli.64	d26,d22,#23
 #if 14<16 && defined(__ARMEL__)
 	vrev64.8	d14,d14
 #endif
 	veor	d25,d24
 	vbsl	d29,d23,d16		@ Ch(e,f,g)
 	vshr.u64	d24,d18,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d17
 	vshr.u64	d25,d18,#34
 	vsli.64	d24,d18,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d18,#39
 	vadd.i64	d28,d14
 	vsli.64	d25,d18,#30
 	veor	d30,d18,d19
 	vsli.64	d26,d18,#25
 	veor	d17,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d20,d19		@ Maj(a,b,c)
 	veor	d17,d26			@ Sigma0(a)
 	vadd.i64	d21,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d17,d30
 	vshr.u64	d24,d21,#14	@ 15
 #if 15<16
 	vld1.64	{d15},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d21,#18
 #if 15>0
 	vadd.i64	d17,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d21,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d21,#50
 	vsli.64	d25,d21,#46
 	vmov	d29,d21
 	vsli.64	d26,d21,#23
 #if 15<16 && defined(__ARMEL__)
 	vrev64.8	d15,d15
 #endif
 	veor	d25,d24
 	vbsl	d29,d22,d23		@ Ch(e,f,g)
 	vshr.u64	d24,d17,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d16
 	vshr.u64	d25,d17,#34
 	vsli.64	d24,d17,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d17,#39
 	vadd.i64	d28,d15
 	vsli.64	d25,d17,#30
 	veor	d30,d17,d18
 	vsli.64	d26,d17,#25
 	veor	d16,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d19,d18		@ Maj(a,b,c)
 	veor	d16,d26			@ Sigma0(a)
 	vadd.i64	d20,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d16,d30
 	mov	r12,#4
 .L16_79_neon:
 	subs	r12,#1
 	vshr.u64	q12,q7,#19
 	vshr.u64	q13,q7,#61
 	vadd.i64	d16,d30			@ h+=Maj from the past
 	vshr.u64	q15,q7,#6
 	vsli.64	q12,q7,#45
 	vext.8	q14,q0,q1,#8	@ X[i+1]
 	vsli.64	q13,q7,#3
 	veor	q15,q12
 	vshr.u64	q12,q14,#1
 	veor	q15,q13				@ sigma1(X[i+14])
 	vshr.u64	q13,q14,#8
 	vadd.i64	q0,q15
 	vshr.u64	q15,q14,#7
 	vsli.64	q12,q14,#63
 	vsli.64	q13,q14,#56
 	vext.8	q14,q4,q5,#8	@ X[i+9]
 	veor	q15,q12
 	vshr.u64	d24,d20,#14		@ from NEON_00_15
 	vadd.i64	q0,q14
 	vshr.u64	d25,d20,#18		@ from NEON_00_15
 	veor	q15,q13				@ sigma0(X[i+1])
 	vshr.u64	d26,d20,#41		@ from NEON_00_15
 	vadd.i64	q0,q15
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d20,#50
 	vsli.64	d25,d20,#46
 	vmov	d29,d20
 	vsli.64	d26,d20,#23
 #if 16<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d21,d22		@ Ch(e,f,g)
 	vshr.u64	d24,d16,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d23
 	vshr.u64	d25,d16,#34
 	vsli.64	d24,d16,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d16,#39
 	vadd.i64	d28,d0
 	vsli.64	d25,d16,#30
 	veor	d30,d16,d17
 	vsli.64	d26,d16,#25
 	veor	d23,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d18,d17		@ Maj(a,b,c)
 	veor	d23,d26			@ Sigma0(a)
 	vadd.i64	d19,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d23,d30
 	vshr.u64	d24,d19,#14	@ 17
 #if 17<16
 	vld1.64	{d1},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d19,#18
 #if 17>0
 	vadd.i64	d23,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d19,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d19,#50
 	vsli.64	d25,d19,#46
 	vmov	d29,d19
 	vsli.64	d26,d19,#23
 #if 17<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d20,d21		@ Ch(e,f,g)
 	vshr.u64	d24,d23,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d22
 	vshr.u64	d25,d23,#34
 	vsli.64	d24,d23,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d23,#39
 	vadd.i64	d28,d1
 	vsli.64	d25,d23,#30
 	veor	d30,d23,d16
 	vsli.64	d26,d23,#25
 	veor	d22,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d17,d16		@ Maj(a,b,c)
 	veor	d22,d26			@ Sigma0(a)
 	vadd.i64	d18,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d22,d30
 	vshr.u64	q12,q0,#19
 	vshr.u64	q13,q0,#61
 	vadd.i64	d22,d30			@ h+=Maj from the past
 	vshr.u64	q15,q0,#6
 	vsli.64	q12,q0,#45
 	vext.8	q14,q1,q2,#8	@ X[i+1]
 	vsli.64	q13,q0,#3
 	veor	q15,q12
 	vshr.u64	q12,q14,#1
 	veor	q15,q13				@ sigma1(X[i+14])
 	vshr.u64	q13,q14,#8
 	vadd.i64	q1,q15
 	vshr.u64	q15,q14,#7
 	vsli.64	q12,q14,#63
 	vsli.64	q13,q14,#56
 	vext.8	q14,q5,q6,#8	@ X[i+9]
 	veor	q15,q12
 	vshr.u64	d24,d18,#14		@ from NEON_00_15
 	vadd.i64	q1,q14
 	vshr.u64	d25,d18,#18		@ from NEON_00_15
 	veor	q15,q13				@ sigma0(X[i+1])
 	vshr.u64	d26,d18,#41		@ from NEON_00_15
 	vadd.i64	q1,q15
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d18,#50
 	vsli.64	d25,d18,#46
 	vmov	d29,d18
 	vsli.64	d26,d18,#23
 #if 18<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d19,d20		@ Ch(e,f,g)
 	vshr.u64	d24,d22,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d21
 	vshr.u64	d25,d22,#34
 	vsli.64	d24,d22,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d22,#39
 	vadd.i64	d28,d2
 	vsli.64	d25,d22,#30
 	veor	d30,d22,d23
 	vsli.64	d26,d22,#25
 	veor	d21,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d16,d23		@ Maj(a,b,c)
 	veor	d21,d26			@ Sigma0(a)
 	vadd.i64	d17,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d21,d30
 	vshr.u64	d24,d17,#14	@ 19
 #if 19<16
 	vld1.64	{d3},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d17,#18
 #if 19>0
 	vadd.i64	d21,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d17,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d17,#50
 	vsli.64	d25,d17,#46
 	vmov	d29,d17
 	vsli.64	d26,d17,#23
 #if 19<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d18,d19		@ Ch(e,f,g)
 	vshr.u64	d24,d21,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d20
 	vshr.u64	d25,d21,#34
 	vsli.64	d24,d21,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d21,#39
 	vadd.i64	d28,d3
 	vsli.64	d25,d21,#30
 	veor	d30,d21,d22
 	vsli.64	d26,d21,#25
 	veor	d20,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d23,d22		@ Maj(a,b,c)
 	veor	d20,d26			@ Sigma0(a)
 	vadd.i64	d16,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d20,d30
 	vshr.u64	q12,q1,#19
 	vshr.u64	q13,q1,#61
 	vadd.i64	d20,d30			@ h+=Maj from the past
 	vshr.u64	q15,q1,#6
 	vsli.64	q12,q1,#45
 	vext.8	q14,q2,q3,#8	@ X[i+1]
 	vsli.64	q13,q1,#3
 	veor	q15,q12
 	vshr.u64	q12,q14,#1
 	veor	q15,q13				@ sigma1(X[i+14])
 	vshr.u64	q13,q14,#8
 	vadd.i64	q2,q15
 	vshr.u64	q15,q14,#7
 	vsli.64	q12,q14,#63
 	vsli.64	q13,q14,#56
 	vext.8	q14,q6,q7,#8	@ X[i+9]
 	veor	q15,q12
 	vshr.u64	d24,d16,#14		@ from NEON_00_15
 	vadd.i64	q2,q14
 	vshr.u64	d25,d16,#18		@ from NEON_00_15
 	veor	q15,q13				@ sigma0(X[i+1])
 	vshr.u64	d26,d16,#41		@ from NEON_00_15
 	vadd.i64	q2,q15
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d16,#50
 	vsli.64	d25,d16,#46
 	vmov	d29,d16
 	vsli.64	d26,d16,#23
 #if 20<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d17,d18		@ Ch(e,f,g)
 	vshr.u64	d24,d20,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d19
 	vshr.u64	d25,d20,#34
 	vsli.64	d24,d20,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d20,#39
 	vadd.i64	d28,d4
 	vsli.64	d25,d20,#30
 	veor	d30,d20,d21
 	vsli.64	d26,d20,#25
 	veor	d19,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d22,d21		@ Maj(a,b,c)
 	veor	d19,d26			@ Sigma0(a)
 	vadd.i64	d23,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d19,d30
 	vshr.u64	d24,d23,#14	@ 21
 #if 21<16
 	vld1.64	{d5},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d23,#18
 #if 21>0
 	vadd.i64	d19,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d23,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d23,#50
 	vsli.64	d25,d23,#46
 	vmov	d29,d23
 	vsli.64	d26,d23,#23
 #if 21<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d16,d17		@ Ch(e,f,g)
 	vshr.u64	d24,d19,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d18
 	vshr.u64	d25,d19,#34
 	vsli.64	d24,d19,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d19,#39
 	vadd.i64	d28,d5
 	vsli.64	d25,d19,#30
 	veor	d30,d19,d20
 	vsli.64	d26,d19,#25
 	veor	d18,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d21,d20		@ Maj(a,b,c)
 	veor	d18,d26			@ Sigma0(a)
 	vadd.i64	d22,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d18,d30
 	vshr.u64	q12,q2,#19
 	vshr.u64	q13,q2,#61
 	vadd.i64	d18,d30			@ h+=Maj from the past
 	vshr.u64	q15,q2,#6
 	vsli.64	q12,q2,#45
 	vext.8	q14,q3,q4,#8	@ X[i+1]
 	vsli.64	q13,q2,#3
 	veor	q15,q12
 	vshr.u64	q12,q14,#1
 	veor	q15,q13				@ sigma1(X[i+14])
 	vshr.u64	q13,q14,#8
 	vadd.i64	q3,q15
 	vshr.u64	q15,q14,#7
 	vsli.64	q12,q14,#63
 	vsli.64	q13,q14,#56
 	vext.8	q14,q7,q0,#8	@ X[i+9]
 	veor	q15,q12
 	vshr.u64	d24,d22,#14		@ from NEON_00_15
 	vadd.i64	q3,q14
 	vshr.u64	d25,d22,#18		@ from NEON_00_15
 	veor	q15,q13				@ sigma0(X[i+1])
 	vshr.u64	d26,d22,#41		@ from NEON_00_15
 	vadd.i64	q3,q15
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d22,#50
 	vsli.64	d25,d22,#46
 	vmov	d29,d22
 	vsli.64	d26,d22,#23
 #if 22<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d23,d16		@ Ch(e,f,g)
 	vshr.u64	d24,d18,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d17
 	vshr.u64	d25,d18,#34
 	vsli.64	d24,d18,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d18,#39
 	vadd.i64	d28,d6
 	vsli.64	d25,d18,#30
 	veor	d30,d18,d19
 	vsli.64	d26,d18,#25
 	veor	d17,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d20,d19		@ Maj(a,b,c)
 	veor	d17,d26			@ Sigma0(a)
 	vadd.i64	d21,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d17,d30
 	vshr.u64	d24,d21,#14	@ 23
 #if 23<16
 	vld1.64	{d7},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d21,#18
 #if 23>0
 	vadd.i64	d17,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d21,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d21,#50
 	vsli.64	d25,d21,#46
 	vmov	d29,d21
 	vsli.64	d26,d21,#23
 #if 23<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d22,d23		@ Ch(e,f,g)
 	vshr.u64	d24,d17,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d16
 	vshr.u64	d25,d17,#34
 	vsli.64	d24,d17,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d17,#39
 	vadd.i64	d28,d7
 	vsli.64	d25,d17,#30
 	veor	d30,d17,d18
 	vsli.64	d26,d17,#25
 	veor	d16,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d19,d18		@ Maj(a,b,c)
 	veor	d16,d26			@ Sigma0(a)
 	vadd.i64	d20,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d16,d30
 	vshr.u64	q12,q3,#19
 	vshr.u64	q13,q3,#61
 	vadd.i64	d16,d30			@ h+=Maj from the past
 	vshr.u64	q15,q3,#6
 	vsli.64	q12,q3,#45
 	vext.8	q14,q4,q5,#8	@ X[i+1]
 	vsli.64	q13,q3,#3
 	veor	q15,q12
 	vshr.u64	q12,q14,#1
 	veor	q15,q13				@ sigma1(X[i+14])
 	vshr.u64	q13,q14,#8
 	vadd.i64	q4,q15
 	vshr.u64	q15,q14,#7
 	vsli.64	q12,q14,#63
 	vsli.64	q13,q14,#56
 	vext.8	q14,q0,q1,#8	@ X[i+9]
 	veor	q15,q12
 	vshr.u64	d24,d20,#14		@ from NEON_00_15
 	vadd.i64	q4,q14
 	vshr.u64	d25,d20,#18		@ from NEON_00_15
 	veor	q15,q13				@ sigma0(X[i+1])
 	vshr.u64	d26,d20,#41		@ from NEON_00_15
 	vadd.i64	q4,q15
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d20,#50
 	vsli.64	d25,d20,#46
 	vmov	d29,d20
 	vsli.64	d26,d20,#23
 #if 24<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d21,d22		@ Ch(e,f,g)
 	vshr.u64	d24,d16,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d23
 	vshr.u64	d25,d16,#34
 	vsli.64	d24,d16,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d16,#39
 	vadd.i64	d28,d8
 	vsli.64	d25,d16,#30
 	veor	d30,d16,d17
 	vsli.64	d26,d16,#25
 	veor	d23,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d18,d17		@ Maj(a,b,c)
 	veor	d23,d26			@ Sigma0(a)
 	vadd.i64	d19,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d23,d30
 	vshr.u64	d24,d19,#14	@ 25
 #if 25<16
 	vld1.64	{d9},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d19,#18
 #if 25>0
 	vadd.i64	d23,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d19,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d19,#50
 	vsli.64	d25,d19,#46
 	vmov	d29,d19
 	vsli.64	d26,d19,#23
 #if 25<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d20,d21		@ Ch(e,f,g)
 	vshr.u64	d24,d23,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d22
 	vshr.u64	d25,d23,#34
 	vsli.64	d24,d23,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d23,#39
 	vadd.i64	d28,d9
 	vsli.64	d25,d23,#30
 	veor	d30,d23,d16
 	vsli.64	d26,d23,#25
 	veor	d22,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d17,d16		@ Maj(a,b,c)
 	veor	d22,d26			@ Sigma0(a)
 	vadd.i64	d18,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d22,d30
 	vshr.u64	q12,q4,#19
 	vshr.u64	q13,q4,#61
 	vadd.i64	d22,d30			@ h+=Maj from the past
 	vshr.u64	q15,q4,#6
 	vsli.64	q12,q4,#45
 	vext.8	q14,q5,q6,#8	@ X[i+1]
 	vsli.64	q13,q4,#3
 	veor	q15,q12
 	vshr.u64	q12,q14,#1
 	veor	q15,q13				@ sigma1(X[i+14])
 	vshr.u64	q13,q14,#8
 	vadd.i64	q5,q15
 	vshr.u64	q15,q14,#7
 	vsli.64	q12,q14,#63
 	vsli.64	q13,q14,#56
 	vext.8	q14,q1,q2,#8	@ X[i+9]
 	veor	q15,q12
 	vshr.u64	d24,d18,#14		@ from NEON_00_15
 	vadd.i64	q5,q14
 	vshr.u64	d25,d18,#18		@ from NEON_00_15
 	veor	q15,q13				@ sigma0(X[i+1])
 	vshr.u64	d26,d18,#41		@ from NEON_00_15
 	vadd.i64	q5,q15
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d18,#50
 	vsli.64	d25,d18,#46
 	vmov	d29,d18
 	vsli.64	d26,d18,#23
 #if 26<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d19,d20		@ Ch(e,f,g)
 	vshr.u64	d24,d22,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d21
 	vshr.u64	d25,d22,#34
 	vsli.64	d24,d22,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d22,#39
 	vadd.i64	d28,d10
 	vsli.64	d25,d22,#30
 	veor	d30,d22,d23
 	vsli.64	d26,d22,#25
 	veor	d21,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d16,d23		@ Maj(a,b,c)
 	veor	d21,d26			@ Sigma0(a)
 	vadd.i64	d17,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d21,d30
 	vshr.u64	d24,d17,#14	@ 27
 #if 27<16
 	vld1.64	{d11},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d17,#18
 #if 27>0
 	vadd.i64	d21,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d17,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d17,#50
 	vsli.64	d25,d17,#46
 	vmov	d29,d17
 	vsli.64	d26,d17,#23
 #if 27<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d18,d19		@ Ch(e,f,g)
 	vshr.u64	d24,d21,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d20
 	vshr.u64	d25,d21,#34
 	vsli.64	d24,d21,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d21,#39
 	vadd.i64	d28,d11
 	vsli.64	d25,d21,#30
 	veor	d30,d21,d22
 	vsli.64	d26,d21,#25
 	veor	d20,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d23,d22		@ Maj(a,b,c)
 	veor	d20,d26			@ Sigma0(a)
 	vadd.i64	d16,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d20,d30
 	vshr.u64	q12,q5,#19
 	vshr.u64	q13,q5,#61
 	vadd.i64	d20,d30			@ h+=Maj from the past
 	vshr.u64	q15,q5,#6
 	vsli.64	q12,q5,#45
 	vext.8	q14,q6,q7,#8	@ X[i+1]
 	vsli.64	q13,q5,#3
 	veor	q15,q12
 	vshr.u64	q12,q14,#1
 	veor	q15,q13				@ sigma1(X[i+14])
 	vshr.u64	q13,q14,#8
 	vadd.i64	q6,q15
 	vshr.u64	q15,q14,#7
 	vsli.64	q12,q14,#63
 	vsli.64	q13,q14,#56
 	vext.8	q14,q2,q3,#8	@ X[i+9]
 	veor	q15,q12
 	vshr.u64	d24,d16,#14		@ from NEON_00_15
 	vadd.i64	q6,q14
 	vshr.u64	d25,d16,#18		@ from NEON_00_15
 	veor	q15,q13				@ sigma0(X[i+1])
 	vshr.u64	d26,d16,#41		@ from NEON_00_15
 	vadd.i64	q6,q15
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d16,#50
 	vsli.64	d25,d16,#46
 	vmov	d29,d16
 	vsli.64	d26,d16,#23
 #if 28<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d17,d18		@ Ch(e,f,g)
 	vshr.u64	d24,d20,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d19
 	vshr.u64	d25,d20,#34
 	vsli.64	d24,d20,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d20,#39
 	vadd.i64	d28,d12
 	vsli.64	d25,d20,#30
 	veor	d30,d20,d21
 	vsli.64	d26,d20,#25
 	veor	d19,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d22,d21		@ Maj(a,b,c)
 	veor	d19,d26			@ Sigma0(a)
 	vadd.i64	d23,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d19,d30
 	vshr.u64	d24,d23,#14	@ 29
 #if 29<16
 	vld1.64	{d13},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d23,#18
 #if 29>0
 	vadd.i64	d19,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d23,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d23,#50
 	vsli.64	d25,d23,#46
 	vmov	d29,d23
 	vsli.64	d26,d23,#23
 #if 29<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d16,d17		@ Ch(e,f,g)
 	vshr.u64	d24,d19,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d18
 	vshr.u64	d25,d19,#34
 	vsli.64	d24,d19,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d19,#39
 	vadd.i64	d28,d13
 	vsli.64	d25,d19,#30
 	veor	d30,d19,d20
 	vsli.64	d26,d19,#25
 	veor	d18,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d21,d20		@ Maj(a,b,c)
 	veor	d18,d26			@ Sigma0(a)
 	vadd.i64	d22,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d18,d30
 	vshr.u64	q12,q6,#19
 	vshr.u64	q13,q6,#61
 	vadd.i64	d18,d30			@ h+=Maj from the past
 	vshr.u64	q15,q6,#6
 	vsli.64	q12,q6,#45
 	vext.8	q14,q7,q0,#8	@ X[i+1]
 	vsli.64	q13,q6,#3
 	veor	q15,q12
 	vshr.u64	q12,q14,#1
 	veor	q15,q13				@ sigma1(X[i+14])
 	vshr.u64	q13,q14,#8
 	vadd.i64	q7,q15
 	vshr.u64	q15,q14,#7
 	vsli.64	q12,q14,#63
 	vsli.64	q13,q14,#56
 	vext.8	q14,q3,q4,#8	@ X[i+9]
 	veor	q15,q12
 	vshr.u64	d24,d22,#14		@ from NEON_00_15
 	vadd.i64	q7,q14
 	vshr.u64	d25,d22,#18		@ from NEON_00_15
 	veor	q15,q13				@ sigma0(X[i+1])
 	vshr.u64	d26,d22,#41		@ from NEON_00_15
 	vadd.i64	q7,q15
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d22,#50
 	vsli.64	d25,d22,#46
 	vmov	d29,d22
 	vsli.64	d26,d22,#23
 #if 30<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d23,d16		@ Ch(e,f,g)
 	vshr.u64	d24,d18,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d17
 	vshr.u64	d25,d18,#34
 	vsli.64	d24,d18,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d18,#39
 	vadd.i64	d28,d14
 	vsli.64	d25,d18,#30
 	veor	d30,d18,d19
 	vsli.64	d26,d18,#25
 	veor	d17,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d20,d19		@ Maj(a,b,c)
 	veor	d17,d26			@ Sigma0(a)
 	vadd.i64	d21,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d17,d30
 	vshr.u64	d24,d21,#14	@ 31
 #if 31<16
 	vld1.64	{d15},[r1]!	@ handles unaligned
 #endif
 	vshr.u64	d25,d21,#18
 #if 31>0
 	vadd.i64	d17,d30			@ h+=Maj from the past
 #endif
 	vshr.u64	d26,d21,#41
 	vld1.64	{d28},[r3,:64]!	@ K[i++]
 	vsli.64	d24,d21,#50
 	vsli.64	d25,d21,#46
 	vmov	d29,d21
 	vsli.64	d26,d21,#23
 #if 31<16 && defined(__ARMEL__)
 	vrev64.8	,
 #endif
 	veor	d25,d24
 	vbsl	d29,d22,d23		@ Ch(e,f,g)
 	vshr.u64	d24,d17,#28
 	veor	d26,d25			@ Sigma1(e)
 	vadd.i64	d27,d29,d16
 	vshr.u64	d25,d17,#34
 	vsli.64	d24,d17,#36
 	vadd.i64	d27,d26
 	vshr.u64	d26,d17,#39
 	vadd.i64	d28,d15
 	vsli.64	d25,d17,#30
 	veor	d30,d17,d18
 	vsli.64	d26,d17,#25
 	veor	d16,d24,d25
 	vadd.i64	d27,d28
 	vbsl	d30,d19,d18		@ Maj(a,b,c)
 	veor	d16,d26			@ Sigma0(a)
 	vadd.i64	d20,d27
 	vadd.i64	d30,d27
 	@ vadd.i64	d16,d30
 	bne	.L16_79_neon
 
 	vadd.i64	d16,d30		@ h+=Maj from the past
 	vldmia	r0,{d24,d25,d26,d27,d28,d29,d30,d31}	@ load context to temp
 	vadd.i64	q8,q12		@ vectorized accumulate
 	vadd.i64	q9,q13
 	vadd.i64	q10,q14
 	vadd.i64	q11,q15
 	vstmia	r0,{d16,d17,d18,d19,d20,d21,d22,d23}	@ save context
 	teq	r1,r2
 	sub	r3,#640	@ rewind K512
 	bne	.Loop_neon
 
 	VFP_ABI_POP
 	bx	lr				@ .word	0xe12fff1e
 .size	zfs_sha512_block_neon,.-zfs_sha512_block_neon
-#endif
+#endif // #if __ARM_ARCH__ >= 7
+#endif // #if defined(__arm__)
diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
index 0655142dcea3..715b0ee6419c 100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c
@@ -1,6472 +1,6465 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <security/mac/mac_framework.h>
 #include <sys/vfs.h>
 #include <sys/endian.h>
 #include <sys/vm.h>
 #include <sys/vnode.h>
 #if __FreeBSD_version >= 1300102
 #include <sys/smr.h>
 #endif
 #include <sys/dirent.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/atomic.h>
 #include <sys/namei.h>
 #include <sys/mman.h>
 #include <sys/cmn_err.h>
 #include <sys/kdb.h>
 #include <sys/sysproto.h>
 #include <sys/errno.h>
 #include <sys/unistd.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/filio.h>
 #include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_quota.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_rlock.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sched.h>
 #include <sys/acl.h>
 #include <sys/vmmeter.h>
 #include <vm/vm_param.h>
 #include <sys/zil.h>
 #include <sys/zfs_vnops.h>
 #include <sys/module.h>
 #include <sys/sysent.h>
 #include <sys/dmu_impl.h>
 #include <sys/brt.h>
 #include <sys/zfeature.h>
 
 #include <vm/vm_object.h>
 
 #include <sys/extattr.h>
 #include <sys/priv.h>
 
 /* Fall back to 0 when the kernel headers do not provide VN_OPEN_INVFS. */
 #ifndef VN_OPEN_INVFS
 #define	VN_OPEN_INVFS	0x0
 #endif
 
 VFS_SMR_DECLARE;
 
 /*
  * Kernels older than 1300103 get NDFREE_PNBUF() defined here in terms of
  * NDFREE() with NDF_ONLY_PNBUF.
  */
 #if __FreeBSD_version < 1300103
 #define	NDFREE_PNBUF(ndp)	NDFREE((ndp), NDF_ONLY_PNBUF)
 #endif
 
 /*
  * From 1300047 on these two expand to nothing; on older kernels they take
  * and drop the page lock.
  */
 #if __FreeBSD_version >= 1300047
 #define	vm_page_wire_lock(pp)
 #define	vm_page_wire_unlock(pp)
 #else
 #define	vm_page_wire_lock(pp) vm_page_lock(pp)
 #define	vm_page_wire_unlock(pp) vm_page_unlock(pp)
 #endif
 
 /*
  * With DEBUG_VFS_LOCKS, assert that the vnode has both a positive hold count
  * and a positive use count; otherwise VNCHECKREF() compiles to nothing.
  */
 #ifdef DEBUG_VFS_LOCKS
 #define	VNCHECKREF(vp)				  \
 	VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp,	\
 	    ("%s: wrong ref counts", __func__));
 #else
 #define	VNCHECKREF(vp)
 #endif
 
 /*
  * cookie_t is uint64_t from 1400045 on and ulong_t before that.
  * NOTE(review): presumably the directory cookie type used by readdir --
  * confirm against its users further down the file.
  */
 #if __FreeBSD_version >= 1400045
 typedef uint64_t cookie_t;
 #else
 typedef ulong_t cookie_t;
 #endif
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1)	A check must be made in each zfs thread for a mounted file system.
  *	This is done, while avoiding races, using zfs_enter(zfsvfs).
  *	A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
  *	must be checked with zfs_verify_zp(zp).  Both of these checks
  *	can fail (EIO), and the calling function must return that error.
  *
  *  (2)	VN_RELE() should always be the last thing except for zil_commit()
  *	(if necessary) and zfs_exit(). This is for 3 reasons:
  *	First, if it's the last reference, the vnode/znode
  *	can be freed, so the zp may point to freed memory.  Second, the last
  *	reference will call zfs_zinactive(), which may induce a lot of work --
  *	pushing cached pages (which acquires range locks) and syncing out
  *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
  *	which could deadlock the system if you were already holding one.
  *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
  *      dmu_tx_assign().  This is critical because we don't want to block
  *      while holding locks.
  *
  *	If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
  *	reduces lock contention and CPU usage when we must wait (note that if
  *	throughput is constrained by the storage, nearly every transaction
  *	must wait).
  *
  *      Note, in particular, that if a lock is sometimes acquired before
  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
  *      to use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
  *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  *	to indicate that this operation has already called dmu_tx_wait().
  *	This will ensure that we don't retry forever, waiting a short bit
  *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *	During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	zfs_enter(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		VN_RELE(...);		// release held vnodes
  *		if (error == ERESTART) {
  *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		zfs_exit(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	VN_RELE(...);			// release held vnodes
  *	zil_commit(zilog, foid);	// synchronous when necessary
  *	zfs_exit(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 /*
  * Open handler.  A write open of an append-only file is refused with EPERM
  * unless FAPPEND is also set; otherwise an O_SYNC open bumps the znode's
  * count of synchronous opens.  Returns 0, EPERM, or the error from
  * zfs_enter_verify_zp().
  */
 static int
 zfs_open(vnode_t **vpp, int flag, cred_t *cr)
 {
 	(void) cr;
 	znode_t	*zp = VTOZ(*vpp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int denied;
 	int error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 
 	if (error != 0)
 		return (error);
 
 	/* Append-only files may only be written through FAPPEND opens. */
 	denied = (flag & FWRITE) != 0 &&
 	    (zp->z_pflags & ZFS_APPENDONLY) != 0 &&
 	    (flag & FAPPEND) == 0;
 
 	/* Keep a count of the synchronous opens in the znode */
 	if (!denied && (flag & O_SYNC))
 		atomic_inc_32(&zp->z_sync_cnt);
 
 	zfs_exit(zfsvfs, FTAG);
 	if (denied)
 		return (SET_ERROR(EPERM));
 	return (0);
 }
 
 /*
  * Close handler.  When this is the last close (count == 1) of an O_SYNC
  * open, drop the znode's count of synchronous opens.  Returns 0 or the
  * error from zfs_enter_verify_zp().
  */
 static int
 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
 {
 	(void) offset, (void) cr;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 
 	if (error == 0) {
 		/* Decrement the synchronous opens in the znode */
 		if ((flag & O_SYNC) && count == 1)
 			atomic_dec_32(&zp->z_sync_cnt);
 		zfs_exit(zfsvfs, FTAG);
 	}
 	return (error);
 }
 
 /*
  * ioctl handler.  _FIOFFS, _FIOGDIO and _FIOSDIO are accepted and ignored;
  * F_SEEK_DATA / F_SEEK_HOLE are forwarded to zfs_holey() with the offset
  * at *data used as an in/out parameter.  Every other command yields ENOTTY.
  */
 static int
 zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
     int *rvalp)
 {
 	(void) flag, (void) cred, (void) rvalp;
 	loff_t off;
 	int error;
 
 	switch (com) {
 	case _FIOFFS:
 		return (0);
 
 	/*
 	 * The following two ioctls are used by bfu.  Faking out,
 	 * necessary to avoid bfu errors.
 	 */
 	case _FIOGDIO:
 	case _FIOSDIO:
 		return (0);
 
 	case F_SEEK_DATA:
 	case F_SEEK_HOLE:
 		/* offset parameter is in/out */
 		off = *(offset_t *)data;
 		error = zfs_holey(VTOZ(vp), com, &off);
 		if (error == 0)
 			*(offset_t *)data = off;
 		return (error);
 
 	default:
 		break;
 	}
 	return (SET_ERROR(ENOTTY));
 }
 
 /*
  * Find the valid page of vp's VM object that covers byte offset `start`
  * and prepare it for being overwritten from the DMU: the page is
  * shared-busied, a paging-in-progress count is added to the object,
  * pmap_remove_write() is called on the page, and the dirty bits for the
  * DEV_BSIZE-aligned interior of [off, off + nbytes) are cleared.
  *
  * `off` and `nbytes` are relative to the start of the page.
  *
  * Returns the page, or NULL when no valid page exists at that index (no
  * page is created).  update_pages() undoes the busy and the pip count
  * with page_unbusy().
  */
 static vm_page_t
 page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t end;
 
 	/*
 	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
 	 * aligned boundaries, if the range is not aligned.  As a result a
 	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
 	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
 	 * the whole page would be considered clean despite having some
 	 * dirty data.
 	 * For this reason we should shrink the range to DEV_BSIZE aligned
 	 * boundaries before calling vm_page_clear_dirty.
 	 */
 	end = rounddown2(off + nbytes, DEV_BSIZE);
 	off = roundup2(off, DEV_BSIZE);
 	nbytes = end - off;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked_12(obj);
 #if __FreeBSD_version < 1300050
 	/* Older kernels: look the page up by hand and wait out exclusive busy. */
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
 			if (vm_page_xbusied(pp)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
 				vm_page_lock(pp);
 				zfs_vmobject_wunlock(obj);
 				vm_page_busy_sleep(pp, "zfsmwb", true);
 				zfs_vmobject_wlock(obj);
 				/* The object lock was dropped; look up again. */
 				continue;
 			}
 			vm_page_sbusy(pp);
 		} else if (pp != NULL) {
 			/* Resident but not valid: treat as absent. */
 			ASSERT(!pp->valid);
 			pp = NULL;
 		}
 		if (pp != NULL) {
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_object_pip_add(obj, 1);
 			pmap_remove_write(pp);
 			/* nbytes is 0 when no whole DEV_BSIZE unit is covered. */
 			if (nbytes != 0)
 				vm_page_clear_dirty(pp, off, nbytes);
 		}
 		break;
 	}
 #else
 	/* Newer kernels: grab an existing valid page shared-busied. */
 	vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start),
 	    VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL |
 	    VM_ALLOC_IGN_SBUSY);
 	if (pp != NULL) {
 		ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 		vm_object_pip_add(obj, 1);
 		pmap_remove_write(pp);
 		/* nbytes is 0 when no whole DEV_BSIZE unit is covered. */
 		if (nbytes != 0)
 			vm_page_clear_dirty(pp, off, nbytes);
 	}
 #endif
 	return (pp);
 }
 
 /*
  * Counterpart of page_busy(): drop the shared busy on the page and then
  * release the paging-in-progress count that page_busy() added to the
  * page's object.
  */
 static void
 page_unbusy(vm_page_t pp)
 {
 
 	vm_page_sunbusy(pp);
 #if __FreeBSD_version >= 1300041
 	vm_object_pip_wakeup(pp->object);
 #else
 	/* Older kernels: decrement the count with vm_object_pip_subtract(). */
 	vm_object_pip_subtract(pp->object, 1);
 #endif
 }
 
 /*
  * page_hold(): return the valid page of vp's VM object covering byte
  * offset `start`, pinned so that mappedread() can copy out of it, or NULL
  * if there is no valid page at that index.  Released with page_unhold().
  *
  * Two implementations are provided, selected by kernel version.
  */
 #if __FreeBSD_version > 1300051
 /* Newer kernels: grab an existing valid page wired and not busied. */
 static vm_page_t
 page_hold(vnode_t *vp, int64_t start)
 {
 	vm_object_t obj;
 	vm_page_t m;
 
 	obj = vp->v_object;
 	vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start),
 	    VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY |
 	    VM_ALLOC_NOBUSY);
 	return (m);
 }
 #else
 /*
  * Older kernels: look the page up under the object write lock, waiting
  * for any exclusive busy to clear, then take a hold with vm_page_hold().
  */
 static vm_page_t
 page_hold(vnode_t *vp, int64_t start)
 {
 	vm_object_t obj;
 	vm_page_t pp;
 
 	obj = vp->v_object;
 	zfs_vmobject_assert_wlocked(obj);
 
 	for (;;) {
 		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
 		    pp->valid) {
 			if (vm_page_xbusied(pp)) {
 				/*
 				 * Reference the page before unlocking and
 				 * sleeping so that the page daemon is less
 				 * likely to reclaim it.
 				 */
 				vm_page_reference(pp);
 				vm_page_lock(pp);
 				zfs_vmobject_wunlock(obj);
 				vm_page_busy_sleep(pp, "zfsmwb", true);
 				zfs_vmobject_wlock(obj);
 				/* The object lock was dropped; look up again. */
 				continue;
 			}
 
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_page_wire_lock(pp);
 			vm_page_hold(pp);
 			vm_page_wire_unlock(pp);
 
 		} else
 			pp = NULL;	/* absent or not valid */
 		break;
 	}
 	return (pp);
 }
 #endif
 
 /*
  * Release a page obtained from page_hold(): unwire it to the active queue
  * on kernels >= 1300035, or drop the hold with vm_page_unhold() on older
  * ones.  vm_page_wire_lock()/vm_page_wire_unlock() are the file-local
  * macros, which expand to nothing from 1300047 on.
  */
 static void
 page_unhold(vm_page_t pp)
 {
 
 	vm_page_wire_lock(pp);
 #if __FreeBSD_version >= 1300035
 	vm_page_unwire(pp, PQ_ACTIVE);
 #else
 	vm_page_unhold(pp);
 #endif
 	vm_page_wire_unlock(pp);
 }
 
 /*
  * Keep memory-mapped pages coherent with the DMU after a write.
  *
  * For each page-sized piece of [start, start + len) that has a valid page
  * in the vnode's VM object (as found by page_busy()), the corresponding
  * bytes are read back from the DMU into that page.  Pieces without such a
  * page are skipped.
  */
 void
 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 {
 	vnode_t *vp = ZTOV(zp);
 	vm_object_t obj;
 	struct sf_buf *sf;
 	caddr_t va;
 	int off;
 
 	ASSERT3P(vp->v_mount, !=, NULL);
 	obj = vp->v_object;
 	ASSERT3P(obj, !=, NULL);
 
 	/* Only the first page can begin at a non-zero in-page offset. */
 	off = start & PAGEOFFSET;
 	zfs_vmobject_wlock_12(obj);
 #if __FreeBSD_version >= 1300041
 	vm_object_pip_add(obj, 1);
 #endif
 	start &= PAGEMASK;
 	while (len > 0) {
 		int nbytes = imin(PAGESIZE - off, len);
 		vm_page_t pp = page_busy(vp, start, off, nbytes);
 
 		if (pp != NULL) {
 			/* Copy from the DMU without the object lock held. */
 			zfs_vmobject_wunlock_12(obj);
 
 			va = zfs_map_page(pp, &sf);
 			(void) dmu_read(os, zp->z_id, start + off, nbytes,
 			    va + off, DMU_READ_PREFETCH);
 			zfs_unmap_page(sf);
 
 			zfs_vmobject_wlock_12(obj);
 			page_unbusy(pp);
 		}
 		len -= nbytes;
 		off = 0;
 		start += PAGESIZE;
 	}
 #if __FreeBSD_version >= 1300041
 	vm_object_pip_wakeup(obj);
 #else
 	vm_object_pip_wakeupn(obj, 0);
 #endif
 	zfs_vmobject_wunlock_12(obj);
 }
 
 /*
  * Read with UIO_NOCOPY flag means that sendfile(2) requests
  * ZFS to populate a range of page cache pages with data.
  *
  * The uio offset must be page aligned (asserted).  Pages are processed one
  * at a time: a page with no valid bits is filled from the DMU (the tail of
  * a partial last page is zeroed), a page that is already fully valid is
  * left alone.  The uio is advanced for every page handled successfully.
  *
  * Returns 0 or the first dmu_read() error; processing stops at that page.
  *
  * NOTE: this function could be optimized to pre-allocate
  * all pages in advance, drain exclusive busy on all of them,
  * map them into contiguous KVA region and populate them
  * in one single dmu_read() call.
  */
 int
 mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio)
 {
 	vnode_t *vp = ZTOV(zp);
 	objset_t *os = zp->z_zfsvfs->z_os;
 	struct sf_buf *sf;
 	vm_object_t obj;
 	vm_page_t pp;
 	int64_t start;
 	caddr_t va;
 	int len = nbytes;
 	int error = 0;
 
 	ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY);
 	ASSERT3P(vp->v_mount, !=, NULL);
 	obj = vp->v_object;
 	ASSERT3P(obj, !=, NULL);
 	ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET);
 
 	zfs_vmobject_wlock_12(obj);
 	for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) {
 		int bytes = MIN(PAGESIZE, len);
 
 		/* Get the page shared-busied (no VM_ALLOC_NOCREAT here). */
 		pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start),
 		    VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
 		if (vm_page_none_valid(pp)) {
 			/* Fill the page from the DMU with the lock dropped. */
 			zfs_vmobject_wunlock_12(obj);
 			va = zfs_map_page(pp, &sf);
 			error = dmu_read(os, zp->z_id, start, bytes, va,
 			    DMU_READ_PREFETCH);
 			/* Zero the remainder of a partially filled page. */
 			if (bytes != PAGESIZE && error == 0)
 				memset(va + bytes, 0, PAGESIZE - bytes);
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock_12(obj);
 #if  __FreeBSD_version >= 1300081
 			if (error == 0) {
 				vm_page_valid(pp);
 				vm_page_activate(pp);
 				vm_page_do_sunbusy(pp);
 			} else {
 				/*
 				 * Read failed: free the page if it is not
 				 * wired, still has no valid bits and the
 				 * busy lock can be upgraded; otherwise just
 				 * drop the shared busy.
 				 */
 				zfs_vmobject_wlock(obj);
 				if (!vm_page_wired(pp) && pp->valid == 0 &&
 				    vm_page_busy_tryupgrade(pp))
 					vm_page_free(pp);
 				else
 					vm_page_sunbusy(pp);
 				zfs_vmobject_wunlock(obj);
 			}
 #else
 			vm_page_do_sunbusy(pp);
 			vm_page_lock(pp);
 			if (error) {
 				/* Free only an unwired, invalid, unbusied page. */
 				if (pp->wire_count == 0 && pp->valid == 0 &&
 				    !vm_page_busied(pp))
 					vm_page_free(pp);
 			} else {
 				pp->valid = VM_PAGE_BITS_ALL;
 				vm_page_activate(pp);
 			}
 			vm_page_unlock(pp);
 #endif
 		} else {
 			/* Page already holds data: nothing to read. */
 			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
 			vm_page_do_sunbusy(pp);
 		}
 		if (error)
 			break;
 		zfs_uio_advance(uio, bytes);
 		len -= bytes;
 	}
 	zfs_vmobject_wunlock_12(obj);
 	return (error);
 }
 
 /*
  * Read from a file that may have memory-mapped pages, keeping the result
  * consistent with those pages.
  *
  * The request is split at page boundaries.  For each piece, a valid page
  * returned by page_hold() is preferred as the data source; when there is
  * none the piece is read from the DMU buffer instead.
  *
  * Returns 0 or the first error from the copy; the loop stops there.
  */
 int
 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 {
 	vnode_t *vp = ZTOV(zp);
 	vm_object_t obj;
 	int64_t pgstart;
 	int resid = nbytes;
 	int pgoff;
 	int error = 0;
 
 	ASSERT3P(vp->v_mount, !=, NULL);
 	obj = vp->v_object;
 	ASSERT3P(obj, !=, NULL);
 
 	/* Split the starting offset into page base and in-page offset. */
 	pgstart = zfs_uio_offset(uio);
 	pgoff = pgstart & PAGEOFFSET;
 	pgstart &= PAGEMASK;
 
 	zfs_vmobject_wlock_12(obj);
 	while (resid > 0) {
 		uint64_t bytes = MIN(PAGESIZE - pgoff, resid);
 		vm_page_t pp = page_hold(vp, pgstart);
 
 		if (pp == NULL) {
 			/* No valid page: copy from the DMU buffer. */
 			zfs_vmobject_wunlock_12(obj);
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, bytes);
 			zfs_vmobject_wlock_12(obj);
 		} else {
 			struct sf_buf *sf;
 			caddr_t va;
 
 			/* Copy straight out of the held page. */
 			zfs_vmobject_wunlock_12(obj);
 			va = zfs_map_page(pp, &sf);
 			error = vn_io_fault_uiomove(va + pgoff, bytes,
 			    GET_UIO_STRUCT(uio));
 			zfs_unmap_page(sf);
 			zfs_vmobject_wlock_12(obj);
 			page_unhold(pp);
 		}
 		resid -= bytes;
 		pgoff = 0;
 		if (error != 0)
 			break;
 		pgstart += PAGESIZE;
 	}
 	zfs_vmobject_wunlock_12(obj);
 	return (error);
 }
 
 /*
  * Synchronously write `len` bytes from the kernel buffer `data` to the
  * znode at offset `pos` using vn_rdwr().
  *
  * If `presid` is non-NULL the residual count is stored there and a short
  * write is not an error.  If `presid` is NULL, any residual is reported
  * as EIO.  Errors from vn_rdwr() are returned as is.
  */
 int
 zfs_write_simple(znode_t *zp, const void *data, size_t len,
     loff_t pos, size_t *presid)
 {
 	ssize_t resid;
 	int error;
 
 	error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
 	    UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	if (presid != NULL) {
 		*presid = resid;
 		return (0);
 	}
 
 	/* The caller cannot see the residual, so a short write must fail. */
 	if (resid != 0)
 		return (SET_ERROR(EIO));
 	return (0);
 }
 
 /*
  * Release the znode's vnode reference asynchronously, on the zrele taskq
  * of the pool that owns the vnode's objset.
  */
 void
 zfs_zrele_async(znode_t *zp)
 {
 	vnode_t *vp = ZTOV(zp);
 
 	VN_RELE_ASYNC(vp,
 	    dsl_pool_zrele_taskq(dmu_objset_pool(ITOZSB(vp)->z_os)));
 }
 
 /*
  * Callback handed to vn_vget_ino_gen() by zfs_lookup_lock() for "..":
  * `arg` is the vnode to return.  It is stored in *vpp and locked with
  * `lkflags`; if locking fails the vnode reference is released and the
  * error returned.
  */
 static int
 zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 {
 	struct vnode *vp = arg;
 	int error;
 
 	*vpp = vp;
 	if ((error = vn_lock(vp, lkflags)) != 0)
 		vrele(vp);
 	return (error);
 }
 
 /*
  * Lock the vnode `vp` that a lookup of `name` in directory `dvp` produced,
  * using lock flags `lkflags`.  Three cases are distinguished by name:
  *
  *  - "" or ".":  vp is dvp (asserted).  An extra reference is taken and
  *    the existing lock is upgraded or downgraded to the requested type;
  *    ENOENT is returned if the vnode turned out doomed after relocking.
  *  - "..":  locking is delegated to vn_vget_ino_gen() with
  *    zfs_dd_callback().
  *  - anything else:  vp is locked directly.
  *
  * On a locking failure the reference on the vnode is released (vrele)
  * and the error returned.  Returns 0 on success.
  */
 static int
 zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
 {
 	znode_t *zdp = VTOZ(dvp);
 	zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
 	int error;
 	int ltype;
 
 	/* The directory lock is only asserted outside of ZIL replay. */
 	if (zfsvfs->z_replay == B_FALSE)
 		ASSERT_VOP_LOCKED(dvp, __func__);
 
 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
 		ASSERT3P(dvp, ==, vp);
 		vref(dvp);
 		ltype = lkflags & LK_TYPE_MASK;
 		/* Adjust the held lock only if it differs from the request. */
 		if (ltype != VOP_ISLOCKED(dvp)) {
 			if (ltype == LK_EXCLUSIVE)
 				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
 			else /* if (ltype == LK_SHARED) */
 				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
 
 			/*
 			 * Relock for the "." case could leave us with
 			 * reclaimed vnode.
 			 */
 			if (VN_IS_DOOMED(dvp)) {
 				vrele(dvp);
 				return (SET_ERROR(ENOENT));
 			}
 		}
 		return (0);
 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
 		/*
 		 * Note that in this case, dvp is the child vnode, and we
 		 * are looking up the parent vnode - exactly reverse from
 		 * normal operation.  Unlocking dvp requires some rather
 		 * tricky unlock/relock dance to prevent mp from being freed;
 		 * use vn_vget_ino_gen() which takes care of all that.
 		 *
 		 * XXX Note that there is a time window when both vnodes are
 		 * unlocked.  It is possible, although highly unlikely, that
 		 * during that window the parent-child relationship between
 		 * the vnodes may change, for example, get reversed.
 		 * In that case we would have a wrong lock order for the vnodes.
 		 * All other filesystems seem to ignore this problem, so we
 		 * do the same here.
 		 * A potential solution could be implemented as follows:
 		 * - using LK_NOWAIT when locking the second vnode and retrying
 		 *   if necessary
 		 * - checking that the parent-child relationship still holds
 		 *   after locking both vnodes and retrying if it doesn't
 		 */
 		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
 		return (error);
 	} else {
 		/* Ordinary child: lock it, dropping our reference on failure. */
 		error = vn_lock(vp, lkflags);
 		if (error != 0)
 			vrele(vp);
 		return (error);
 	}
 }
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held vnode reference for it.
  *
  *	IN:	dvp	- vnode of directory to search.
  *		nm	- name of entry to lookup.
  *		cnp	- component name of the lookup; its cn_flags and
  *			  cn_lkflags are consulted (and cn_flags updated).
  *		nameiop	- namei operation (LOOKUP, CREATE, RENAME, DELETE).
  *		cr	- credentials of caller.
  *		flags	- LOOKUP_XATTR set if looking for an attribute.
  *		cached	- B_TRUE when coming in via VOP_CACHEDLOOKUP, in
  *			  which case the directory access check is skipped.
  *
  *	OUT:	vpp	- vnode of located entry, NULL if not found.
  *
  *	RETURN:	0 on success, error code on failure.  For CREATE and RENAME
  *		of the last component, ENOENT is translated to EJUSTRETURN.
  *
  * Timestamps:
  *	NA
  */
 static int
 zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp,
     struct componentname *cnp, int nameiop, cred_t *cr, int flags,
     boolean_t cached)
 {
 	znode_t *zdp = VTOZ(dvp);
 	znode_t *zp;
 	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
 #if	__FreeBSD_version > 1300124
 	seqc_t dvp_seqc;
 #endif
 	int	error = 0;
 
 	/*
 	 * Fast path lookup, however we must skip DNLC lookup
 	 * for case folding or normalizing lookups because the
 	 * DNLC code only stores the passed in name.  This means
 	 * creating 'a' and removing 'A' on a case insensitive
 	 * file system would work, but DNLC still thinks 'a'
 	 * exists and won't let you create it again on the next
 	 * pass through fast path.
 	 */
 	if (!(flags & LOOKUP_XATTR)) {
 		if (dvp->v_type != VDIR) {
 			return (SET_ERROR(ENOTDIR));
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (SET_ERROR(EIO));
 		}
 	}
 
 	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp,
 	    const char *, nm);
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 		return (error);
 
 #if	__FreeBSD_version > 1300124
 	/* Sampled now; compared after the lookup for the ".." case below. */
 	dvp_seqc = vn_seqc_read_notmodify(dvp);
 #endif
 
 	*vpp = NULL;
 
 	if (flags & LOOKUP_XATTR) {
 		/*
 		 * If the xattr property is off, refuse the lookup request.
 		 */
 		if (!(zfsvfs->z_flags & ZSB_XATTR)) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EOPNOTSUPP));
 		}
 
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_pflags & ZFS_XATTR) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 		*vpp = ZTOV(zp);
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 		error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL);
 		if (error) {
 			vrele(ZTOV(zp));
 			/*
 			 * The reference was just dropped; do not hand the
 			 * caller a pointer to a vnode it does not hold.
 			 */
 			*vpp = NULL;
 		}
 
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Check accessibility of directory if we're not coming in via
 	 * VOP_CACHEDLOOKUP.
 	 */
 	if (!cached) {
 #ifdef NOEXECCHECK
 		if ((cnp->cn_flags & NOEXECCHECK) != 0) {
 			cnp->cn_flags &= ~NOEXECCHECK;
 		} else
 #endif
 		if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 		    NULL))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 
 	/*
 	 * First handle the special cases.
 	 */
 	if ((cnp->cn_flags & ISDOTDOT) != 0) {
 		/*
 		 * If we are a snapshot mounted under .zfs, return
 		 * the vp for the snapshot directory.
 		 */
 		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
 			struct componentname cn;
 			vnode_t *zfsctl_vp;
 			int ltype;
 
 			zfs_exit(zfsvfs, FTAG);
 			/* Remember the lock type so it can be restored below. */
 			ltype = VOP_ISLOCKED(dvp);
 			VOP_UNLOCK1(dvp);
 			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
 			    &zfsctl_vp);
 			if (error == 0) {
 				cn.cn_nameptr = "snapshot";
 				cn.cn_namelen = strlen(cn.cn_nameptr);
 				cn.cn_nameiop = cnp->cn_nameiop;
 				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
 				cn.cn_lkflags = cnp->cn_lkflags;
 				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
 				vput(zfsctl_vp);
 			}
 			vn_lock(dvp, ltype | LK_RETRY);
 			return (error);
 		}
 	}
 	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
 		zfs_exit(zfsvfs, FTAG);
 		/* Only a plain LOOKUP may end on the control directory. */
 		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
 			return (SET_ERROR(ENOTSUP));
 		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
 		return (error);
 	}
 
 	/*
 	 * The loop retries the lookup if the parent-child relationship
 	 * changes during the dot-dot locking complexities.
 	 */
 	for (;;) {
 		uint64_t parent;
 
 		error = zfs_dirlook(zdp, nm, &zp);
 		if (error == 0)
 			*vpp = ZTOV(zp);
 
 		zfs_exit(zfsvfs, FTAG);
 		if (error != 0)
 			break;
 
 		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
 		if (error != 0) {
 			/*
 			 * If we've got a locking error, then the vnode
 			 * got reclaimed because of a force unmount.
 			 * We never enter doomed vnodes into the name cache.
 			 */
 			*vpp = NULL;
 			return (error);
 		}
 
 		if ((cnp->cn_flags & ISDOTDOT) == 0)
 			break;
 
 		/*
 		 * "..": re-enter and check that the vnode we locked is
 		 * still recorded as dvp's parent.
 		 */
 		if ((error = zfs_enter(zfsvfs, FTAG)) != 0) {
 			vput(ZTOV(zp));
 			*vpp = NULL;
 			return (error);
 		}
 		if (zdp->z_sa_hdl == NULL) {
 			error = SET_ERROR(EIO);
 		} else {
 			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 			    &parent, sizeof (parent));
 		}
 		if (error != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			vput(ZTOV(zp));
 			break;
 		}
 		if (zp->z_id == parent) {
 			zfs_exit(zfsvfs, FTAG);
 			break;
 		}
 		/* Parent changed while unlocked: drop it and look up again. */
 		vput(ZTOV(zp));
 	}
 
 	if (error != 0)
 		*vpp = NULL;
 
 	/* Translate errors and add SAVENAME when needed. */
 	if (cnp->cn_flags & ISLASTCN) {
 		switch (nameiop) {
 		case CREATE:
 		case RENAME:
 			if (error == ENOENT) {
 				error = EJUSTRETURN;
 #if __FreeBSD_version < 1400068
 				cnp->cn_flags |= SAVENAME;
 #endif
 				break;
 			}
 			zfs_fallthrough;
 		case DELETE:
 #if __FreeBSD_version < 1400068
 			if (error == 0)
 				cnp->cn_flags |= SAVENAME;
 #endif
 			break;
 		}
 	}
 
 #if	__FreeBSD_version > 1300124
 	if ((cnp->cn_flags & ISDOTDOT) != 0) {
 		/*
 		 * FIXME: zfs_lookup_lock relocks vnodes and does nothing to
 		 * handle races. In particular different callers may end up
 		 * with different vnodes and will try to add conflicting
 		 * entries to the namecache.
 		 *
 		 * While finding different result may be acceptable in face
 		 * of concurrent modification, adding conflicting entries
 		 * trips over an assert in the namecache.
 		 *
 		 * Ultimately let an entry through once everything settles.
 		 */
 		if (!vn_seqc_consistent(dvp, dvp_seqc)) {
 			cnp->cn_flags &= ~MAKEENTRY;
 		}
 	}
 #endif
 
 	/* Insert name into cache (as non-existent) if appropriate. */
 	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
 	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(dvp, NULL, cnp);
 
 	/* Insert name into cache if appropriate. */
 	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
 	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
 		if (!(cnp->cn_flags & ISLASTCN) ||
 		    (nameiop != DELETE && nameiop != RENAME)) {
 			cache_enter(dvp, *vpp, cnp);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory and return the znode of
  * the created file.
  *
  * NOTE(review): the historical description also mentioned truncating an
  * existing file.  In this implementation the name is looked up with ZNEW
  * and any lookup error is returned; no truncate path is visible here.
  *
  *	IN:	dzp	- znode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- flag indicating exclusive or non-exclusive mode
  *			  [UNUSED].
  *		mode	- mode to open file with [UNUSED].
  *		cr	- credentials of caller.
  *		flag	- large file flag [UNUSED].
  *		vsecp	- ACL to be set
  *		mnt_ns	- only passed through to zfs_zaccess() here.
  *
  *	OUT:	zpp	- znode of created entry; set to NULL first and
  *			  filled in only on success.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime updated if new entry created
  *	 vp - ctime|mtime always, atime if new
  */
 int
 zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode,
     znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns)
 {
 	(void) excl, (void) mode, (void) flag;
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	objset_t	*os;
 	dmu_tx_t	*tx;
 	int		error;
 	uid_t		uid = crgetuid(cr);
 	gid_t		gid = crgetgid(cr);
 	uint64_t	projid = ZFS_DEFAULT_PROJID;
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype;
 #ifdef DEBUG_VFS_LOCKS
 	/* Only referenced by VNCHECKREF(), which is empty otherwise. */
 	vnode_t	*dvp = ZTOV(dzp);
 #endif
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || (vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	/* On a utf8only dataset the name must be valid UTF-8. */
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	*zpp = NULL;
 
 	/* Silently drop the sticky bit if policy does not allow setting it. */
 	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
 		vap->va_mode &= ~S_ISVTX;
 
 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 	if (error) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	ASSERT3P(zp, ==, NULL);
 
 	/*
 	 * Create a new file object and update the directory
 	 * to reference it.
 	 */
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 		goto out;
 	}
 
 	/*
 	 * We only support the creation of regular files in
 	 * extended attribute directories.
 	 */
 
 	if ((dzp->z_pflags & ZFS_XATTR) &&
 	    (vap->va_type != VREG)) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/* From here until consumed, acl_ids must be freed on every path. */
 	if ((error = zfs_acl_ids_create(dzp, 0, vap,
 	    cr, vsecp, &acl_ids, NULL)) != 0)
 		goto out;
 
 	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 		projid = zfs_inherit_projid(dzp);
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 		zfs_acl_ids_free(&acl_ids);
 		error = SET_ERROR(EDQUOT);
 		goto out;
 	}
 
 	/* Paired with getnewvnode_drop_reserve() on every path below. */
 	getnewvnode_reserve_();
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa &&
 	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	error = zfs_link_create(dzp, name, zp, tx, ZNEW);
 	if (error != 0) {
 		/*
 		 * Since, we failed to add the directory entry for it,
 		 * delete the newly created dnode.
 		 */
 		zfs_znode_delete(zp, tx);
 		VOP_UNLOCK1(ZTOV(zp));
 		zrele(zp);
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_commit(tx);
 		getnewvnode_drop_reserve();
 		goto out;
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/* Log the create in the ZIL before the tx is committed. */
 	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 	    vsecp, acl_ids.z_fuidp, vap);
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 out:
 	VNCHECKREF(dvp);
 	if (error == 0) {
 		*zpp = zp;
 	}
 
 	/* With sync=always, commit the ZIL; this runs on error paths too. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dvp	- vnode of directory to remove entry from.
  *		vp	- vnode of the entry being removed.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dvp - ctime|mtime
  *	 vp - ctime (if nlink > 0)
  */
 static int
 zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp;
 	znode_t		*xzp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	xattr_obj;
 	uint64_t	obj = 0;
 	dmu_tx_t	*tx;
 	boolean_t	unlinked;
 	uint64_t	txtype;
 	int		error;
 
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zp = VTOZ(vp);
 	if ((error = zfs_verify_zp(zp)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	zilog = zfsvfs->z_log;
 
 	xattr_obj = 0;
 	xzp = NULL;
 
 	if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (vp->v_type == VDIR) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	/*
 	 * NOTE(review): `ct' is not declared in this function; presumably
 	 * vnevent_remove() is a macro that discards its arguments -- confirm.
 	 */
 	vnevent_remove(vp, dvp, name, ct);
 
 	obj = zp->z_id;
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		ASSERT0(error);
 	}
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 
 	if (xzp) {
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	/*
 	 * Mark this transaction as typically resulting in a net free of space
 	 */
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		/*
 		 * This path bypasses the `out' label, so the xattr directory
 		 * znode obtained from zfs_zget() above must be released here
 		 * or its vnode reference is leaked.
 		 */
 		if (xzp)
 			vrele(ZTOV(xzp));
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	if (unlinked) {
 		zfs_unlinked_add(zp, tx);
 		vp->v_vflag |= VV_NOSYNC;
 	}
 	/* XXX check changes to linux vnops */
 	txtype = TX_REMOVE;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
 
 	dmu_tx_commit(tx);
 out:
 
 	/* Release the xattr directory znode, if one was looked up. */
 	if (xzp)
 		vrele(ZTOV(xzp));
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 
 /*
  * Look up `name' in directory dzp using kernel credentials (kcred).
  *
  * *cnp is filled in as a last-component lookup for operation nameiop.  The
  * lookup goes through vfs_cache_lookup() when z_use_namecache is set and
  * z_replay is clear, and through zfs_lookup() otherwise.
  *
  * Returns the error code of whichever lookup routine was called; the
  * resulting vnode is reported through vpp by that routine.
  */
 static int
 zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp,
     struct componentname *cnp, int nameiop)
 {
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	int		rc;
 
 	/* Build the name descriptor for the requested operation. */
 	cnp->cn_nameiop = nameiop;
 	cnp->cn_nameptr = __DECONST(char *, name);
 	cnp->cn_namelen = strlen(name);
 #if __FreeBSD_version < 1400068
 	cnp->cn_flags = ISLASTCN | SAVENAME;
 #else
 	cnp->cn_flags = ISLASTCN;
 #endif
 	cnp->cn_cred = kcred;
 	cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
 #if __FreeBSD_version < 1400037
 	cnp->cn_thread = curthread;
 #endif
 
 	if (!zfsvfs->z_use_namecache || zfsvfs->z_replay) {
 		/* Name cache bypassed: ask ZFS directly. */
 		rc = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0,
 		    B_FALSE);
 	} else {
 		struct vop_lookup_args lookup_args;
 
 		lookup_args.a_gen.a_desc = &vop_lookup_desc;
 		lookup_args.a_dvp = ZTOV(dzp);
 		lookup_args.a_vpp = vpp;
 		lookup_args.a_cnp = cnp;
 		rc = vfs_cache_lookup(&lookup_args);
 	}
 #ifdef ZFS_DEBUG
 	if (rc != 0) {
 		printf("got error %d on name %s on op %d\n", rc, name,
 		    nameiop);
 		kdb_backtrace();
 	}
 #endif
 	return (rc);
 }
 
 /*
  * Remove the entry `name' from directory dzp.
  *
  * The entry is first resolved with a DELETE lookup, then handed to
  * zfs_remove_(); the vnode returned by the lookup is released with vput()
  * whether or not the removal succeeded.  `flags' is not used.
  *
  * Returns 0 on success, error code on failure.
  */
 int
 zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags)
 {
 	struct componentname cn;
 	vnode_t *vp;
 	int error;
 
 	error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_remove_(ZTOV(dzp), vp, name, cr);
 	vput(vp);
 
 	return (error);
 }
 /*
  * Create a new directory and insert it into dzp using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dzp	- znode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
  *		flags	- case flags (not used by this implementation)
  *		vsecp	- ACL to be set (not used by this implementation;
  *			  the ACL ids are created with a NULL vsecp)
  *		mnt_ns	- Unused on FreeBSD (only forwarded to zfs_zaccess())
  *
  *	OUT:	zpp	- znode of created directory.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dzp - ctime|mtime updated
  *	 zp - ctime|mtime|atime updated
  */
 int
 zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp,
     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
 {
 	(void) flags, (void) vsecp;
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
 	uid_t		uid = crgetuid(cr);
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 
 	ASSERT3U(vap->va_type, ==, VDIR);
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    ((vap->va_mask & AT_XVATTR) ||
 	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	/* Subdirectories cannot be created inside an xattr directory. */
 	if (dzp->z_pflags & ZFS_XATTR) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* On utf8-only filesystems the name must validate as UTF-8. */
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & AT_XVATTR) {
 		if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_type)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	/*
 	 * From here on acl_ids is live: every exit path below must call
 	 * zfs_acl_ids_free() on it.
 	 */
 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
 	    NULL, &acl_ids, NULL)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * First make sure the new directory doesn't exist.
 	 *
 	 * Existence is checked first to make sure we don't return
 	 * EACCES instead of EEXIST which can cause some applications
 	 * to fail.
 	 */
 	*zpp = NULL;
 
 	if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	ASSERT3P(zp, ==, NULL);
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
 	    mnt_ns))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	getnewvnode_reserve_();
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		/* The tx was never assigned: abort it rather than commit. */
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	error = zfs_link_create(dzp, dirname, zp, tx, ZNEW);
 	if (error != 0) {
 		/*
 		 * Linking failed: delete the znode created above, drop it,
 		 * and let the common exit path commit the transaction.
 		 * *zpp stays NULL.
 		 */
 		zfs_znode_delete(zp, tx);
 		VOP_UNLOCK1(ZTOV(zp));
 		zrele(zp);
 		goto out;
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	*zpp = zp;
 
 	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
 	    acl_ids.z_fuidp, vap);
 
 out:
 	/* Shared by the success path and the link-failure path. */
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 #if	__FreeBSD_version < 1300124
 /*
  * Local definition of cache_vop_rmdir(), compiled only when
  * __FreeBSD_version is below 1300124: purge the name cache entries of
  * both the parent directory and the directory being removed.
  */
 static void
 cache_vop_rmdir(struct vnode *dvp, struct vnode *vp)
 {
 
 	cache_purge(dvp);
 	cache_purge(vp);
 }
 #endif
 
 /*
  * Remove the directory vp, known as `name', from its parent dvp.
  *
  *	IN:	dvp	- vnode of directory to remove from.
  *		vp	- vnode of the directory being removed.
  *		name	- name of directory to be removed.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 on success, error code on failure
  *		(ENOTDIR if vp is not a directory).
  *
  * Timestamps:
  *	dvp - ctime|mtime updated
  */
 static int
 zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr)
 {
 	znode_t		*dzp = VTOZ(dvp);
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	uint64_t	txtype = TX_RMDIR;
 	int		error;
 
 	error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = zfs_verify_zp(zp);
 	if (error != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	zilog = zfsvfs->z_log;
 
 	/* Permission and type checks come before any transaction work. */
 	error = zfs_zaccess_delete(dzp, zp, cr, NULL);
 	if (error != 0)
 		goto out;
 
 	if (vp->v_type != VDIR) {
 		error = SET_ERROR(ENOTDIR);
 		goto out;
 	}
 
 	vnevent_rmdir(vp, dvp, name, ct);
 
 	/*
 	 * Hold the parent's ZAP entry, the victim's SA and the unlinked
 	 * set, and mark the transaction as a net free of space.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/* Only log the removal if the link was actually destroyed. */
 	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
 	if (error == 0) {
 		zfs_log_remove(zilog, tx, txtype, dzp, name,
 		    ZFS_NO_OBJECT, B_FALSE);
 	}
 
 	dmu_tx_commit(tx);
 
 	if (zfsvfs->z_use_namecache)
 		cache_vop_rmdir(dvp, vp);
 
 out:
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Remove the subdirectory `name' from directory dzp.
  *
  * The entry is first resolved with a DELETE lookup, then handed to
  * zfs_rmdir_(); the vnode returned by the lookup is released with vput()
  * whether or not the removal succeeded.  `cwd' and `flags' are not used.
  *
  * Returns 0 on success, error code on failure.
  */
 int
 zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags)
 {
 	struct componentname cn;
 	vnode_t *vp;
 	int error;
 
 	error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE);
 	if (error != 0)
 		return (error);
 
 	error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
 	vput(vp);
 
 	return (error);
 }
 
 /*
  * Read as many directory entries as will fit into the provided
  * buffer from the given directory cursor position (specified in
  * the uio structure).
  *
  *	IN:	vp	- vnode of directory to read.
  *		uio	- structure supplying read location, range info,
  *			  and return buffer.
  *		cr	- credentials of caller.
  *
  *	OUT:	uio	- updated offset and range, buffer filled.
  *		eofp	- set to true if end-of-file detected (may be NULL).
  *		ncookies- number of entries in cookies (may be NULL, in
  *			  which case no cookie array is allocated)
  *		cookies	- offsets to directory entries; allocated here with
  *			  malloc(M_TEMP) and freed again here on error
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap is always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 static int
 zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp,
     int *ncookies, cookie_t **cookies)
 {
 	znode_t		*zp = VTOZ(vp);
 	iovec_t		*iovp;
 	dirent64_t	*odp;
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	caddr_t		outbuf;
 	size_t		bufsize;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	uint_t		bytes_wanted;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 	uint64_t	parent;
 	int		local_eof;
 	int		outcount;
 	int		error;
 	uint8_t		prefetch;
 	uint8_t		type;
 	int		ncooks;
 	cookie_t	*cooks = NULL;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	/* The parent object number is reported as the inode of "..". */
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent))) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * If we are not given an eof variable,
 	 * use a local one.
 	 */
 	if (eofp == NULL)
 		eofp = &local_eof;
 
 	/*
 	 * Check for valid iov_len.
 	 */
 	if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 */
 	if ((*eofp = zp->z_unlinked) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = zfs_uio_offset(uio);
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Get space to change directory entries into fs independent format.
 	 *
 	 * Entries are staged in a temporary buffer unless the destination
 	 * is a single UIO_SYSSPACE iovec, in which case they are written
 	 * straight into the caller's buffer.
 	 */
 	iovp = GET_UIO_STRUCT(uio)->uio_iov;
 	bytes_wanted = iovp->iov_len;
 	if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) {
 		bufsize = bytes_wanted;
 		outbuf = kmem_alloc(bufsize, KM_SLEEP);
 		odp = (struct dirent64 *)outbuf;
 	} else {
 		bufsize = bytes_wanted;
 		outbuf = NULL;
 		odp = (struct dirent64 *)iovp->iov_base;
 	}
 
 	if (ncookies != NULL) {
 		/*
 		 * Minimum entry size is dirent size and 1 byte for a file name.
 		 * Dividing the residual by it gives an upper bound on the
 		 * number of entries, and therefore cookies, that can be
 		 * returned.
 		 */
 		ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) -
 		    sizeof (((struct dirent *)NULL)->d_name) + 1);
 		cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK);
 		*cookies = cooks;
 		*ncookies = ncooks;
 	}
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	outcount = 0;
 	while (outcount < bytes_wanted) {
 		ino64_t objnum;
 		ushort_t reclen;
 		off64_t *next = NULL;
 
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			zap.za_normalization_conflict = 0;
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			zap.za_normalization_conflict = 0;
 			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			zap.za_normalization_conflict = 0;
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.
 			 * ENOENT from the cursor means end of directory;
 			 * any other error aborts the read.
 			 */
 			if ((error = zap_cursor_retrieve(&zc, &zap))) {
 				if ((*eofp = (error == ENOENT)) != 0)
 					break;
 				else
 					goto update;
 			}
 
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers != 1) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset);
 				error = SET_ERROR(ENXIO);
 				goto update;
 			}
 
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			/*
 			 * MacOS X can extract the object type here such as:
 			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 			 */
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 		}
 
 		reclen = DIRENT64_RECLEN(strlen(zap.za_name));
 
 		/*
 		 * Will this entry fit in the buffer?
 		 */
 		if (outcount + reclen > bufsize) {
 			/*
 			 * Did we manage to fit anything in the buffer?
 			 */
 			if (!outcount) {
 				error = SET_ERROR(EINVAL);
 				goto update;
 			}
 			break;
 		}
 		/*
 		 * Add normal entry:
 		 */
 		odp->d_ino = objnum;
 		odp->d_reclen = reclen;
 		odp->d_namlen = strlen(zap.za_name);
 		/* NOTE: d_off is the offset for the *next* entry. */
 		next = &odp->d_off;
 		strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
 		odp->d_type = type;
 		dirent_terminate(odp);
 		odp = (dirent64_t *)((intptr_t)odp + reclen);
 
 		outcount += reclen;
 
 		ASSERT3S(outcount, <=, bufsize);
 
 		/* Prefetch the dnode of the entry just emitted, if enabled. */
 		if (prefetch)
 			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
 
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 * Only real ZAP entries advance the cursor; the synthetic
 		 * entries simply step the small-integer offset.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 
 		/* Fill the offset right after advancing the cursor. */
 		if (next != NULL)
 			*next = offset;
 		if (cooks != NULL) {
 			*cooks++ = offset;
 			ncooks--;
 			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
 		}
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 	/* Subtract unused cookies */
 	if (ncookies != NULL)
 		*ncookies -= ncooks;
 
 	if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) {
 		/* Entries were written in place; just advance the iovec. */
 		iovp->iov_base += outcount;
 		iovp->iov_len -= outcount;
 		zfs_uio_resid(uio) -= outcount;
 	} else if ((error =
 	    zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
 		/*
 		 * Reset the pointer.
 		 */
 		offset = zfs_uio_offset(uio);
 	}
 
 	/*
 	 * Reached by fall-through on the normal path, and directly (skipping
 	 * the copy-out above) when retrieving or formatting an entry failed.
 	 */
 update:
 	zap_cursor_fini(&zc);
 	/* Same condition as the kmem_alloc() of the staging buffer above. */
 	if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1)
 		kmem_free(outbuf, bufsize);
 
 	if (error == ENOENT)
 		error = 0;
 
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	zfs_uio_setoffset(uio, offset);
 	zfs_exit(zfsvfs, FTAG);
 	/* On failure, do not hand a cookie array back to the caller. */
 	if (error != 0 && cookies != NULL) {
 		free(*cookies, M_TEMP);
 		*cookies = NULL;
 		*ncookies = 0;
 	}
 	return (error);
 }
 
 /*
  * Get the requested file attributes and place them in the provided
  * vattr structure.
  *
  *	IN:	vp	- vnode of file.
  *		vap	- va_mask identifies requested attributes.
  *			  If AT_XVATTR set, then optional attrs are requested
  *		flags	- ATTR_NOACLCHECK (CIFS server context)
  *		cr	- credentials of caller.
  *
  *	OUT:	vap	- attribute values.
  *
  *	RETURN:	0 on success, error code on failure (from entering the
  *		filesystem, the SA bulk lookup, or the
  *		ACE_READ_ATTRIBUTES access check).
  */
 static int
 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int	error = 0;
 	uint32_t blksize;
 	u_longlong_t nblocks;
 	uint64_t mtime[2], ctime[2], crtime[2], rdev;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t *xoap = NULL;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	sa_bulk_attr_t bulk[4];
 	int count = 0;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
 
 	/*
 	 * Fetch the timestamps in one bulk SA lookup; rdev is only
 	 * requested (and only valid afterwards) for device nodes.
 	 */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
 		    &rdev, 8);
 
 	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
 	 * Also, if we are the owner don't bother, since owner should
 	 * always be allowed to read basic attributes of file.
 	 */
 	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
 	    (vap->va_uid != crgetuid(cr))) {
 		if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
 		    skipaclchk, cr, NULL))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	/*
 	 * Return all attributes.  It's cheaper to provide the answer
 	 * than to determine whether we were asked the question.
 	 */
 
 	vap->va_type = IFTOVT(zp->z_mode);
 	vap->va_mode = zp->z_mode & ~S_IFMT;
 	vn_fsid(vp, vap);
 	vap->va_nodeid = zp->z_id;
 	vap->va_nlink = zp->z_links;
 	/*
 	 * On a root vnode that shows the control directory, report one
 	 * extra link, unless the count is already at ZFS_LINK_MAX.
 	 */
 	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
 	    zp->z_links < ZFS_LINK_MAX)
 		vap->va_nlink++;
 	vap->va_size = zp->z_size;
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		vap->va_rdev = zfs_cmpldev(rdev);
 	else
 		vap->va_rdev = 0;
 	vap->va_gen = zp->z_gen;
 	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
 	vap->va_filerev = zp->z_seq;
 
 	/*
 	 * Add in any requested optional attributes and the create time.
 	 * Also set the corresponding bits in the returned attribute bitmap.
 	 * Each block below copies one z_pflags bit (or znode field) into
 	 * the optional-attribute structure if the caller requested it.
 	 */
 	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
 		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
 			xoap->xoa_archive =
 			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
 			XVA_SET_RTN(xvap, XAT_ARCHIVE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
 			xoap->xoa_readonly =
 			    ((zp->z_pflags & ZFS_READONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_READONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
 			xoap->xoa_system =
 			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
 			XVA_SET_RTN(xvap, XAT_SYSTEM);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
 			xoap->xoa_hidden =
 			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
 			XVA_SET_RTN(xvap, XAT_HIDDEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			xoap->xoa_nounlink =
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
 			XVA_SET_RTN(xvap, XAT_NOUNLINK);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			xoap->xoa_immutable =
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
 			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			xoap->xoa_appendonly =
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
 			XVA_SET_RTN(xvap, XAT_APPENDONLY);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			xoap->xoa_nodump =
 			    ((zp->z_pflags & ZFS_NODUMP) != 0);
 			XVA_SET_RTN(xvap, XAT_NODUMP);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
 			xoap->xoa_opaque =
 			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
 			XVA_SET_RTN(xvap, XAT_OPAQUE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			xoap->xoa_av_quarantined =
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			xoap->xoa_av_modified =
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
 			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
 		}
 
 		/* The scanstamp is only fetched for regular files. */
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
 		    vp->v_type == VREG) {
 			zfs_sa_get_scanstamp(zp, xvap);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_REPARSE);
 		}
 		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
 			xoap->xoa_generation = zp->z_gen;
 			XVA_SET_RTN(xvap, XAT_GEN);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
 			xoap->xoa_offline =
 			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
 			XVA_SET_RTN(xvap, XAT_OFFLINE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
 			xoap->xoa_sparse =
 			    ((zp->z_pflags & ZFS_SPARSE) != 0);
 			XVA_SET_RTN(xvap, XAT_SPARSE);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
 			xoap->xoa_projinherit =
 			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
 			XVA_SET_RTN(xvap, XAT_PROJINHERIT);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
 			xoap->xoa_projid = zp->z_projid;
 			XVA_SET_RTN(xvap, XAT_PROJID);
 		}
 	}
 
 	/* atime comes from the znode; the others from the SA lookup above. */
 	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
 	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
 	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
 	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
 
 
 	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
 	vap->va_blksize = blksize;
 	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
 
 	if (zp->z_blksz == 0) {
 		/*
 		 * Block size hasn't been set; suggest maximal I/O transfers.
 		 */
 		vap->va_blksize = zfsvfs->z_max_blksz;
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (0);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	zp	- znode of file to be modified.
  *		vap	- new attribute values.
  *			  If AT_XVATTR set, then optional attrs are being set
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *			- ATTR_NOACLCHECK (CIFS context only).
  *		cr	- credentials of caller.
  *		mnt_ns	- Unused on FreeBSD
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - ctime updated, mtime updated if size changed.
  */
 int
 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
 {
 	vnode_t		*vp = ZTOV(zp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	objset_t	*os;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	tmpxvattr;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask = 0;
 	uint64_t	saved_mode;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2];
 	uint64_t	projid = ZFS_INVALID_PROJID;
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err, err2;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
 	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
 	int		count = 0, xattr_count = 0;
 
 	if (mask == 0)
 		return (0);
 
 	if (mask & AT_NOSET)
 		return (SET_ERROR(EINVAL));
 
 	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (err);
 
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
 	 * that file system is at proper version level
 	 */
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & AT_XVATTR))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (mask & AT_SIZE && vp->v_type == VDIR) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EISDIR));
 	}
 
 	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If this is an xvattr_t, then get a pointer to the structure of
 	 * optional attributes.  If this is NULL, then we have a vattr_t.
 	 */
 	xoap = xva_getxoptattr(xvap);
 
 	xva_init(&tmpxvattr);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
 	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	/*
 	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
 	 */
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
 	 * ZFS can handle large timestamps, but 32bit syscalls can't
 	 * handle times greater than 2039.  This check should be removed
 	 * once large timestamps are fully supported.
 	 */
 	if (mask & (AT_ATIME | AT_MTIME)) {
 		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EOVERFLOW));
 		}
 	}
 	if (xoap != NULL && (mask & AT_XVATTR)) {
 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
 		    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EOVERFLOW));
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
 			if (!dmu_objset_projectquota_enabled(os) ||
 			    (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) {
 				zfs_exit(zfsvfs, FTAG);
 				return (SET_ERROR(EOPNOTSUPP));
 			}
 
 			projid = xoap->xoa_projid;
 			if (unlikely(projid == ZFS_INVALID_PROJID)) {
 				zfs_exit(zfsvfs, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 
 			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
 				projid = ZFS_INVALID_PROJID;
 			else
 				need_policy = TRUE;
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
 		    (xoap->xoa_projinherit !=
 		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
 		    (!dmu_objset_projectquota_enabled(os) ||
 		    (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EOPNOTSUPP));
 		}
 	}
 
 	attrzp = NULL;
 	aclp = NULL;
 
 	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EROFS));
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & AT_SIZE) {
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		/* XXX - would it be OK to generate a log record here? */
 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err) {
 			zfs_exit(zfsvfs, FTAG);
 			return (err);
 		}
 	}
 
 	if (mask & (AT_ATIME|AT_MTIME) ||
 	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr, mnt_ns);
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 		int	idmask = (mask & (AT_UID|AT_GID));
 		int	take_owner;
 		int	take_group;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & AT_MODE))
 			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
 		take_group = (mask & AT_GID) &&
 		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
 
 		/*
 		 * If both AT_UID and AT_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
 		    ((idmask == AT_UID) && take_owner) ||
 		    ((idmask == AT_GID) && take_group)) {
 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 			    skipaclchk, cr, mnt_ns) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				secpolicy_setid_clear(vap, vp, cr);
 				trim_mask = (mask & (AT_UID|AT_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & AT_XVATTR) {
 		/*
 		 * Update xvattr mask to include only those attributes
 		 * that are actually changing.
 		 *
 		 * the bits will be restored prior to actually setting
 		 * the attributes so the caller thinks they were set.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
 			if (xoap->xoa_projinherit !=
 			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
 				XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
 				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			if ((vp->v_type != VREG &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EPERM));
 		}
 
 		if (need_policy == FALSE &&
 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
 
 	if (mask & AT_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
 		    mnt_ns) == 0) {
 			err = secpolicy_setid_setsticky_clear(vp, vap,
 			    &oldva, cr);
 			if (err) {
 				zfs_exit(zfsvfs, FTAG);
 				return (err);
 			}
 			trim_mask |= AT_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Save the mode, as secpolicy_vnode_setattr()
 				 * will overwrite it with ova.va_mode.
 				 */
 				saved_mode = vap->va_mode;
 			}
 		}
 		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
 		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
 		if (err) {
 			zfs_exit(zfsvfs, FTAG);
 			return (err);
 		}
 
 		if (trim_mask) {
 			vap->va_mask |= saved_mask;
 			if (trim_mask & AT_MODE) {
 				/*
 				 * Recover the mode after
 				 * secpolicy_vnode_setattr().
 				 */
 				vap->va_mode = saved_mode;
 			}
 		}
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) {
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
 			if (err == 0) {
 				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
 				if (err != 0)
 					vrele(ZTOV(attrzp));
 			}
 			if (err)
 				goto out2;
 		}
 		if (mask & AT_UID) {
 			new_uid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 			if (new_uid != zp->z_uid &&
 			    zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
 			    new_uid)) {
 				if (attrzp)
 					vput(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (mask & AT_GID) {
 			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
 			    cr, ZFS_GROUP, &fuidp);
 			if (new_gid != zp->z_gid &&
 			    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
 			    new_gid)) {
 				if (attrzp)
 					vput(ZTOV(attrzp));
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (projid != ZFS_INVALID_PROJID &&
 		    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
 			if (attrzp)
 				vput(ZTOV(attrzp));
 			err = SET_ERROR(EDQUOT);
 			goto out2;
 		}
 	}
 	tx = dmu_tx_create(os);
 
 	if (mask & AT_MODE) {
 		uint64_t pmode = zp->z_mode;
 		uint64_t acl_obj;
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
 			err = SET_ERROR(EPERM);
 			goto out;
 		}
 
 		if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
 			goto out;
 
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
 			 * to V1 format?
 			 */
 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
 				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
 				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if (((mask & AT_XVATTR) &&
 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
 		    (projid != ZFS_INVALID_PROJID &&
 		    !(zp->z_pflags & ZFS_PROJID)))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
 	if (attrzp) {
 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err)
 		goto out;
 
 	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 	if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
 		/*
 		 * For the existed object that is upgraded from old system,
 		 * its on-disk layout has no slot for the project ID attribute.
 		 * But quota accounting logic needs to access related slots by
 		 * offset directly. So we need to adjust old objects' layout
 		 * to make the project ID to some unified and fixed offset.
 		 */
 		if (attrzp)
 			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
 		if (err == 0)
 			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
 
 		if (unlikely(err == EEXIST))
 			err = 0;
 		else if (err != 0)
 			goto out;
 		else
 			projid = ZFS_INVALID_PROJID;
 	}
 
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_enter(&zp->z_acl_lock);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
 		if (projid != ZFS_INVALID_PROJID) {
 			attrzp->z_projid = projid;
 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
 			    sizeof (attrzp->z_projid));
 		}
 	}
 
 	if (mask & (AT_UID|AT_GID)) {
 
 		if (mask & AT_UID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &new_uid, sizeof (new_uid));
 			zp->z_uid = new_uid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 				    sizeof (new_uid));
 				attrzp->z_uid = new_uid;
 			}
 		}
 
 		if (mask & AT_GID) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 			    NULL, &new_gid, sizeof (new_gid));
 			zp->z_gid = new_gid;
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 				    sizeof (new_gid));
 				attrzp->z_gid = new_gid;
 			}
 		}
 		if (!(mask & AT_MODE)) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 			    NULL, &new_mode, sizeof (new_mode));
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
 		ASSERT0(err);
 		if (attrzp) {
 			vn_seqc_write_begin(ZTOV(attrzp));
 			err = zfs_acl_chown_setattr(attrzp);
 			vn_seqc_write_end(ZTOV(attrzp));
 			ASSERT0(err);
 		}
 	}
 
 	if (mask & AT_MODE) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 		    &new_mode, sizeof (new_mode));
 		zp->z_mode = new_mode;
 		ASSERT3P(aclp, !=, NULL);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT0(err);
 		if (zp->z_acl_cached)
 			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
 	}
 
 
 	if (mask & AT_ATIME) {
 		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 		    &zp->z_atime, sizeof (zp->z_atime));
 	}
 
 	if (mask & AT_MTIME) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
 	if (projid != ZFS_INVALID_PROJID) {
 		zp->z_projid = projid;
 		SA_ADD_BULK_ATTR(bulk, count,
 		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
 		    sizeof (zp->z_projid));
 	}
 
 	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
 	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
 		    NULL, mtime, sizeof (mtime));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 	} else if (mask != 0) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime);
 		if (attrzp) {
 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 			    SA_ZPL_CTIME(zfsvfs), NULL,
 			    &ctime, sizeof (ctime));
 			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
 			    mtime, ctime);
 		}
 	}
 
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
 	 */
 
 	if (xoap && (mask & AT_XVATTR)) {
 
 		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
 			xoap->xoa_createtime = vap->va_birthtime;
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
 		 */
 
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
 			XVA_SET_REQ(xvap, XAT_NODUMP);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 		if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) {
 			XVA_SET_REQ(xvap, XAT_PROJINHERIT);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT3S(vp->v_type, ==, VREG);
 
 		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
 	if (mask & (AT_UID|AT_GID|AT_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
 	if (attrzp) {
 		if (mask & (AT_UID|AT_GID|AT_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
 	}
 out:
 	if (err == 0 && attrzp) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT0(err2);
 	}
 
 	if (attrzp)
 		vput(ZTOV(attrzp));
 
 	if (aclp)
 		zfs_acl_free(aclp);
 
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
 	}
 
 	if (err) {
 		dmu_tx_abort(tx);
 	} else {
 		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
 	}
 
 out2:
 	if (os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (err);
 }
 
 /*
  * Resolve the source and target names again inside their directories.
  *
  *	IN:	sdzp	- Source directory znode.
  *		scnp	- Source entry name.
  *		tdzp	- Target directory znode.
  *		tcnp	- Target entry name.
  *	OUT:	szpp	- Znode found for the source name.
  *		tzpp	- Result of the target lookup.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * *szpp is stored as soon as the source lookup succeeds.  If the target
  * lookup fails afterwards, the source vnode reference is dropped with
  * vrele() and *tzpp is left untouched.
  */
 static int
 zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp,
     znode_t **szpp, znode_t *tdzp, const struct componentname *tcnp,
     znode_t **tzpp)
 {
 	zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
 	znode_t *src_zp, *dst_zp;
 	int error;
 
 	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
 
 	/*
 	 * Both directory znodes have to be verified as live before they are
 	 * used.  There are two separate concerns inherited from the illumos
 	 * port: the vnode must not have been reclaimed (doomed), which is the
 	 * usual FreeBSD worry, and the znode itself must still be valid.  A
 	 * rollback or receive can invalidate a znode without taking the
 	 * vnode lock; z_teardown_lock, taken behind zfs_enter and released
 	 * behind zfs_exit, is what guards against that.
 	 */
 	error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG);
 	if (error != 0)
 		return (error);
 	error = zfs_verify_zp(tdzp);
 	if (error != 0)
 		goto done;
 
 	/*
 	 * Look the source up again so that we know it still exists and
 	 * so that we operate on the right vnode.
 	 */
 	error = zfs_dirent_lookup(sdzp, scnp->cn_nameptr, &src_zp, ZEXISTS);
 	if (error != 0) {
 		/* "." and ".." as a source are reported as EINVAL. */
 		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
 		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
 			error = SET_ERROR(EINVAL);
 		goto done;
 	}
 	*szpp = src_zp;
 
 	/*
 	 * Look the target up again; a target that has gone away is fine.
 	 */
 	error = zfs_dirent_lookup(tdzp, tcnp->cn_nameptr, &dst_zp, 0);
 	if (error == 0) {
 		*tzpp = dst_zp;
 	} else {
 		/* Give back the source reference obtained above. */
 		vrele(ZTOV(src_zp));
 		if ((tcnp->cn_flags & ISDOTDOT) != 0)
 			error = SET_ERROR(EINVAL);
 	}
 
 done:
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Drop the target-side locks held on entry and (re)acquire exclusive
  * locks on sdvp, tdvp, the source vnode and, if one exists, the target
  * vnode.
  *
  *	IN:	sdvp	- Source directory vnode.
  *		tdvp	- Target directory vnode.
  *		scnp	- Source entry name.
  *		tcnp	- Target entry name.
  *	INOUT:	svpp	- Source vnode; replaced by the re-resolved vnode.
  *		tvpp	- Target vnode; replaced by the re-resolved vnode,
  *			  or set to NULL.
  *
  *	RETURN:	0 with the vnodes above locked, or an error code with
  *		none of them locked.
  *
  * We acquire all but fdvp (sdvp here) locks using non-blocking
  * acquisitions.  If we fail to acquire any lock in the path we will drop
  * all held locks, acquire the new lock in a blocking fashion, and then
  * release it and restart the rename.  This acquire/release step ensures
  * that we do not spin on a lock waiting for release.  On error release
  * all vnode locks and decrement references the way tmpfs_rename() would
  * do.
  */
 static int
 zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
     struct vnode *tdvp, struct vnode **tvpp,
     const struct componentname *scnp, const struct componentname *tcnp)
 {
 	struct vnode	*nvp, *svp, *tvp;
 	znode_t		*sdzp, *tdzp, *szp, *tzp;
 	int		error;
 
 	/* Start from a state where none of the four vnodes is locked. */
 	VOP_UNLOCK1(tdvp);
 	if (*tvpp != NULL && *tvpp != tdvp)
 		VOP_UNLOCK1(*tvpp);
 
 relock:
 	/* sdvp is the only lock taken with a blocking request up front. */
 	error = vn_lock(sdvp, LK_EXCLUSIVE);
 	if (error)
 		goto out;
 	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
 	if (error != 0) {
 		VOP_UNLOCK1(sdvp);
 		if (error != EBUSY)
 			goto out;
 		/* Wait for tdvp with nothing else held, then start over. */
 		error = vn_lock(tdvp, LK_EXCLUSIVE);
 		if (error)
 			goto out;
 		VOP_UNLOCK1(tdvp);
 		goto relock;
 	}
 	tdzp = VTOZ(tdvp);
 	sdzp = VTOZ(sdvp);
 
 	/* With both directories locked, resolve the names again. */
 	error = zfs_rename_relock_lookup(sdzp, scnp, &szp, tdzp, tcnp, &tzp);
 	if (error != 0) {
 		VOP_UNLOCK1(sdvp);
 		VOP_UNLOCK1(tdvp);
 		goto out;
 	}
 	svp = ZTOV(szp);
 	tvp = tzp != NULL ? ZTOV(tzp) : NULL;
 
 	/*
 	 * Now try acquire locks on svp and tvp.
 	 */
 	nvp = svp;
 	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
 	if (error != 0) {
 		VOP_UNLOCK1(sdvp);
 		VOP_UNLOCK1(tdvp);
 		/* The freshly looked-up target is not needed on this path. */
 		if (tvp != NULL)
 			vrele(tvp);
 		if (error != EBUSY) {
 			vrele(nvp);
 			goto out;
 		}
 		error = vn_lock(nvp, LK_EXCLUSIVE);
 		if (error != 0) {
 			vrele(nvp);
 			goto out;
 		}
 		VOP_UNLOCK1(nvp);
 		/*
 		 * Concurrent rename race.
 		 * XXX ?
 		 */
 		if (nvp == tdvp) {
 			vrele(nvp);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 		/* Hand the new source vnode to the caller and retry. */
 		vrele(*svpp);
 		*svpp = nvp;
 		goto relock;
 	}
 	/* Source locked: swap the caller's reference for the new one. */
 	vrele(*svpp);
 	*svpp = nvp;
 
 	/* The caller's old target reference is dropped unconditionally. */
 	if (*tvpp != NULL)
 		vrele(*tvpp);
 	*tvpp = NULL;
 	if (tvp != NULL) {
 		nvp = tvp;
 		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
 		if (error != 0) {
 			VOP_UNLOCK1(sdvp);
 			VOP_UNLOCK1(tdvp);
 			VOP_UNLOCK1(*svpp);
 			if (error != EBUSY) {
 				vrele(nvp);
 				goto out;
 			}
 			error = vn_lock(nvp, LK_EXCLUSIVE);
 			if (error != 0) {
 				vrele(nvp);
 				goto out;
 			}
 			/* Unlock and release; the retry looks it up again. */
 			vput(nvp);
 			goto relock;
 		}
 		*tvpp = nvp;
 	}
 
 	return (0);
 
 out:
 	return (error);
 }
 
 /*
  * Verify that moving szp from sdzp into tdzp would not make a directory
  * its own descendant, by walking the parent chain upward from tdzp.
  *
  *	RETURN:	0 if the move is acceptable, EINVAL if szp is tdzp or one
  *		of its ancestors, or the error from a failed lookup.
  *
  * Znodes fetched during the walk are released with VN_RELE_ASYNC: the
  * walk goes up the directory tree, and vrele may need to acquire an
  * exclusive lock if a last reference to a vnode is dropped.
  */
 static int
 zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
 {
 	zfsvfs_t	*zfsvfs = tdzp->z_zfsvfs;
 	znode_t		*cur, *up;
 	uint64_t	parent;
 	int		error;
 
 	if (tdzp == szp)
 		return (SET_ERROR(EINVAL));
 	/* Same directory, or the target is the root: nothing to walk. */
 	if (tdzp == sdzp || tdzp->z_id == zfsvfs->z_root)
 		return (0);
 
 	for (cur = tdzp; ; cur = up) {
 		ASSERT(!cur->z_unlinked);
 		error = sa_lookup(cur->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 		    &parent, sizeof (parent));
 		if (error != 0)
 			break;
 
 		if (parent == szp->z_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 		/* Reaching the root or the source directory ends the walk. */
 		if (parent == zfsvfs->z_root || parent == sdzp->z_id)
 			break;
 
 		error = zfs_zget(zfsvfs, parent, &up);
 		if (error != 0)
 			break;
 
 		/* Only znodes obtained by this walk are released. */
 		if (cur != tdzp) {
 			VN_RELE_ASYNC(ZTOV(cur),
 			    dsl_pool_zrele_taskq(
 			    dmu_objset_pool(zfsvfs->z_os)));
 		}
 	}
 
 	if (error == ENOTDIR)
 		panic("checkpath: .. not a directory\n");
 	if (cur != tdzp) {
 		VN_RELE_ASYNC(ZTOV(cur),
 		    dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
 	}
 	return (error);
 }
 
 #if	__FreeBSD_version < 1300124
 /*
  * Local cache_vop_rename() for FreeBSD versions below 1300124: purge the
  * name cache entries of the source vnode and of the target vnode (when
  * there is one), then purge the negative entries of the target directory.
  * The source directory and both component names are not used.
  */
 static void
 cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp,
     struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp)
 {
 	(void) fdvp, (void) fcnp, (void) tcnp;
 
 	cache_purge(fvp);
 	if (tvp != NULL) {
 		cache_purge(tvp);
 	}
 	cache_purge_negative(tdvp);
 }
 #endif
 
 /*
  * Forward declaration: zfs_do_rename() calls this function before its
  * definition appears in the file.
  */
 static int
 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
     cred_t *cr);
 
 /*
  * Rename the entry scnp in sdvp to tcnp in tdvp.
  *
  *	IN:	sdvp	- Source directory containing the "old entry".
  *		scnp	- Old entry name.
  *		tdvp	- Target directory to contain the "new entry".
  *		tcnp	- New entry name.
  *		cr	- credentials of caller.
  *	INOUT:	svpp	- Source file
  *		tvpp	- Target file, may point to NULL initially
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Locking:
  *	tdvp and *tvpp (if not NULL) must be exclusively locked on entry.
  *	Every vnode this function holds locked is unlocked before return.
  *
  * Timestamps:
  *	sdvp,tdvp - ctime|mtime updated
  */
 static int
 zfs_do_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
     cred_t *cr)
 {
 	int	error;
 
 	ASSERT_VOP_ELOCKED(tdvp, __func__);
 	if (*tvpp != NULL)
 		ASSERT_VOP_ELOCKED(*tvpp, __func__);
 
 	/*
 	 * Renames across filesystems and renames into a zfsctl node are
 	 * both rejected with EXDEV.
 	 */
 	if ((*svpp)->v_mount != tdvp->v_mount ||
 	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount) ||
 	    zfsctl_is_node(tdvp)) {
 		error = SET_ERROR(EXDEV);
 	} else {
 		/*
 		 * All four vnodes get locked here so that the rename is
 		 * safe and has the expected semantics.  If relocking
 		 * fails, nothing is left locked and we return right away.
 		 */
 		error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
 		if (error != 0)
 			return (error);
 
 		error = zfs_do_rename_impl(sdvp, svpp, scnp, tdvp, tvpp,
 		    tcnp, cr);
 		VOP_UNLOCK1(sdvp);
 		VOP_UNLOCK1(*svpp);
 	}
 
 	/* Target-side locks are released on every remaining path. */
 	if (*tvpp != NULL)
 		VOP_UNLOCK1(*tvpp);
 	if (tdvp != *tvpp)
 		VOP_UNLOCK1(tdvp);
 
 	return (error);
 }
 
 /*
  * Carry out a rename after zfs_do_rename() has locked the vnodes.
  *
  *	IN:	sdvp	- Source directory vnode.
  *		svpp	- Source vnode.
  *		scnp	- Source entry name.
  *		tdvp	- Target directory vnode.
  *		tvpp	- Target vnode, *tvpp may be NULL.
  *		tcnp	- Target entry name.
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * The function validates the request, then in a single transaction
  * removes an existing target entry, links the source under the new name
  * and unlinks the old name.  Vnode locks are neither taken nor released
  * here.
  *
  * NOTE(review): the vnevent_*() calls reference `ct`, which is not
  * declared in this function; presumably those are macros that discard
  * their arguments -- confirm.
  */
 static int
 zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
     vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
     cred_t *cr)
 {
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs;
 	zilog_t		*zilog;
 	znode_t		*tdzp, *sdzp, *tzp, *szp;
 	const char	*snm = scnp->cn_nameptr;
 	const char	*tnm = tcnp->cn_nameptr;
 	int		error;
 
 	tdzp = VTOZ(tdvp);
 	sdzp = VTOZ(sdvp);
 	zfsvfs = tdzp->z_zfsvfs;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
 		return (error);
 	if ((error = zfs_verify_zp(sdzp)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		error = SET_ERROR(EILSEQ);
 		goto out;
 	}
 
 	/* If source and target are the same file, there is nothing to do. */
 	if ((*svpp) == (*tvpp)) {
 		error = 0;
 		goto out;
 	}
 
 	/* A directory with something mounted on it cannot take part. */
 	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
 	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
 	    (*tvpp)->v_mountedhere != NULL)) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	szp = VTOZ(*svpp);
 	if ((error = zfs_verify_zp(szp)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
 	if (tzp != NULL) {
 		if ((error = zfs_verify_zp(tzp)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * If we are using project inheritance, means if the directory has
 	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
 	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
 	 * such case, we only allow renames into our tree when the project
 	 * IDs are the same.
 	 */
 	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
 	    tdzp->z_projid != szp->z_projid) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, NULL)))
 		goto out;
 
 	if ((*svpp)->v_type == VDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
 		    sdzp == szp ||
 		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
 			/* Use SET_ERROR like every other error site here. */
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
 		if ((error = zfs_rename_check(szp, sdzp, tdzp)))
 			goto out;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		/*
 		 * Source and target must be the same type.
 		 */
 		if ((*svpp)->v_type == VDIR) {
 			if ((*tvpp)->v_type != VDIR) {
 				error = SET_ERROR(ENOTDIR);
 				goto out;
 			} else {
 				cache_purge(tdvp);
 				if (sdvp != tdvp)
 					cache_purge(sdvp);
 			}
 		} else {
 			if ((*tvpp)->v_type == VDIR) {
 				error = SET_ERROR(EISDIR);
 				goto out;
 			}
 		}
 	}
 
 	/*
 	 * Every vn_seqc_write_begin() below is paired with a
 	 * vn_seqc_write_end() at out_seq.
 	 */
 	vn_seqc_write_begin(*svpp);
 	vn_seqc_write_begin(sdvp);
 	if (*tvpp != NULL)
 		vn_seqc_write_begin(*tvpp);
 	if (tdvp != *tvpp)
 		vn_seqc_write_begin(tdvp);
 
 	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
 	if (tzp)
 		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
 
 	/*
 	 * notify the target directory if it is not the same
 	 * as source directory.
 	 */
 	if (tdvp != sdvp) {
 		vnevent_rename_dest_dir(tdvp, ct);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tdzp);
 	}
 	if (tzp) {
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		goto out_seq;
 	}
 
 	if (tzp)	/* Attempt to remove the existing target */
 		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
 
 	if (error == 0) {
 		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
 		if (error == 0) {
 			szp->z_pflags |= ZFS_AV_MODIFIED;
 
 			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 
 			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
 			    NULL);
 			if (error == 0) {
 				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
 				    snm, tdzp, tnm, szp);
 			} else {
 				/*
 				 * At this point, we have successfully created
 				 * the target name, but have failed to remove
 				 * the source name.  Since the create was done
 				 * with the ZRENAMING flag, there are
 				 * complications; for one, the link count is
 				 * wrong.  The easiest way to deal with this
 				 * is to remove the newly created target, and
 				 * return the original error.  This must
 				 * succeed; fortunately, it is very unlikely to
 				 * fail, since we just created it.
 				 */
 				VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx,
 				    ZRENAMING, NULL));
 			}
 		}
 		if (error == 0) {
 			cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp);
 		}
 	}
 
 	/* The transaction is committed whether or not the steps succeeded. */
 	dmu_tx_commit(tx);
 
 out_seq:
 	vn_seqc_write_end(*svpp);
 	vn_seqc_write_end(sdvp);
 	if (*tvpp != NULL)
 		vn_seqc_write_end(*tvpp);
 	if (tdvp != *tvpp)
 		vn_seqc_write_end(tdvp);
 
 out:
 	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Rename sname in directory sdzp to tname in directory tdzp.
  *
  *	IN:	sdzp	- Source directory.
  *		sname	- Old entry name.
  *		tdzp	- Target directory.
  *		tname	- New entry name.
  *		cr	- credentials of caller.
  *		rflags	- must be 0, otherwise EINVAL is returned.
  *		wo_vap	- must be NULL, otherwise EINVAL is returned.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * flags and mnt_ns are not used by this function.
  */
 int
 zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname,
     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
 {
 	struct componentname scn, tcn;
 	vnode_t *svp = NULL, *tvp = NULL;
 	vnode_t *sdvp, *tdvp;
 	int error;
 
 	if (rflags != 0 || wo_vap != NULL)
 		return (SET_ERROR(EINVAL));
 
 	sdvp = ZTOV(sdzp);
 	tdvp = ZTOV(tdzp);
 
 	error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
 	/* The source directory stays locked only while replaying. */
 	if (sdzp->z_zfsvfs->z_replay == B_FALSE)
 		VOP_UNLOCK1(sdvp);
 
 	if (error == 0) {
 		VOP_UNLOCK1(svp);
 
 		vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);
 		error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
 		/* EJUSTRETURN is handled as "there is no target". */
 		if (error == EJUSTRETURN)
 			tvp = NULL;
 
 		if (error == 0 || error == EJUSTRETURN) {
 			error = zfs_do_rename(sdvp, &svp, &scn, tdvp, &tvp,
 			    &tcn, cr);
 		} else {
 			VOP_UNLOCK1(tdvp);
 		}
 	}
 
 	/* Drop whatever vnode references are still held. */
 	if (svp != NULL)
 		vrele(svp);
 	if (tvp != NULL)
 		vrele(tvp);
 
 	return (error);
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dzp	- Directory to contain new symbolic link.
  *		name	- Name for new symlink entry.
  *		vap	- Attributes of new entry; va_type must be VLNK.
  *		link	- Contents of the symlink (the path it refers to).
  *		cr	- credentials of caller.
  *		flags	- case flags (not used here)
  *		mnt_ns	- Passed to zfs_zaccess(); noted as unused on FreeBSD
  *
  *	OUT:	zpp	- znode of the new symlink, set only on success.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dzp - ctime|mtime updated
  */
 int
 zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
     const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
 {
 	(void) flags;
 	znode_t		*zp;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
 	zilog_t		*zilog;
 	uint64_t	len = strlen(link);
 	int		error;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype = TX_SYMLINK;
 
 	ASSERT3S(vap->va_type, ==, VLNK);
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	/* On utf8-only filesystems the entry name must be valid UTF-8. */
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (len > MAXPATHLEN) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	/* From here on every exit path must free acl_ids. */
 	if ((error = zfs_acl_ids_create(dzp, 0,
 	    vap, cr, NULL, &acl_ids, NULL)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
 	    0 /* projid */)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/* Paired with getnewvnode_drop_reserve() on both paths below. */
 	getnewvnode_reserve_();
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE + len);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		getnewvnode_drop_reserve();
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Create a new object for the symlink.
 	 * for version 4 ZPL datasets the symlink will be an SA attribute
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * NOTE(review): the sa_update() result stored in error here is
 	 * overwritten by zfs_link_create() below without being checked.
 	 */
 	if (zp->z_is_sa)
 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    __DECONST(void *, link), len, tx);
 	else
 		zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
 
 	/* The size of the symlink is the length of its contents. */
 	zp->z_size = len;
 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx);
 	/*
 	 * Insert the new object into the directory.
 	 */
 	error = zfs_link_create(dzp, name, zp, tx, ZNEW);
 	if (error != 0) {
 		/* Undo the creation: delete, unlock and release the znode. */
 		zfs_znode_delete(zp, tx);
 		VOP_UNLOCK1(ZTOV(zp));
 		zrele(zp);
 	} else {
 		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 	}
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	getnewvnode_drop_reserve();
 
 	if (error == 0) {
 		*zpp = zp;
 
 		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 			zil_commit(zilog, 0);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Copy the path stored in the symbolic link vp into the buffer described
  * by the provided uio structure.
  *
  *	IN:	vp	- vnode of symbolic link.
  *		uio	- structure to contain the link path.
  *		cr	- credentials of caller (not used).
  *		ct	- caller context (not used).
  *
  *	OUT:	uio	- structure containing the link path.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	vp - atime updated
  */
 static int
 zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct)
 {
 	(void) cr, (void) ct;
 	znode_t		*link_zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = link_zp->z_zfsvfs;
 	int		error;
 
 	error = zfs_enter_verify_zp(zfsvfs, link_zp, FTAG);
 	if (error != 0)
 		return (error);
 
 	/* SA znodes keep the path in the SA_ZPL_SYMLINK attribute. */
 	if (link_zp->z_is_sa) {
 		error = sa_lookup_uio(link_zp->z_sa_hdl,
 		    SA_ZPL_SYMLINK(zfsvfs), uio);
 	} else {
 		error = zfs_sa_readlink(link_zp, uio);
 	}
 
 	/* The access time is stamped whether or not the read worked. */
 	ZFS_ACCESSTIME_STAMP(zfsvfs, link_zp);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Create a hard link: add an entry called name to directory tdzp that
  * refers to the existing object szp.
  *
  *	IN:	tdzp	- Directory to contain new entry.
  *		szp	- znode of the object being linked.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *		flags	- unused.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	tdzp - ctime|mtime updated
  *	 szp - ctime updated
  */
 int
 zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr,
     int flags)
 {
 	(void) flags;
 	znode_t		*tzp;
 	zfsvfs_t	*zfsvfs = tdzp->z_zfsvfs;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	int		error;
 	uint64_t	parent;
 	uid_t		owner;
 
 	ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR);
 
 	error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG);
 	if (error != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Hard links to directories are refused.  POSIX dictates EPERM
 	 * here, although ENOTSUP or EISDIR would be better choices.
 	 */
 	if (ZTOV(szp)->v_type == VDIR) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	error = zfs_verify_zp(szp);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * With project inheritance (ZFS_PROJINHERIT set on the directory),
 	 * descendants inherit both the project ID and the flag itself, so
 	 * a hard link is only allowed inside the tree when the source and
 	 * the target directory share the same project ID.
 	 */
 	if ((tdzp->z_pflags & ZFS_PROJINHERIT) &&
 	    tdzp->z_projid != szp->z_projid) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	if (szp->z_pflags &
 	    (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	/* Prevent links to .zfs/shares files */
 	error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (uint64_t));
 	if (error != 0)
 		goto out;
 	if (parent == zfsvfs->z_shares_dir) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		error = SET_ERROR(EILSEQ);
 		goto out;
 	}
 
 	/*
 	 * Links between attribute space and regular file space are not
 	 * supported: they could be used to place a link in "normal" file
 	 * space and so sidestep restrictions imposed in attribute space.
 	 */
 	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/* Non-owners additionally need to pass secpolicy_basic_link(). */
 	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
 	if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, NULL);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
 	if (error != 0)
 		goto out;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, tdzp);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	error = zfs_link_create(tdzp, name, szp, tx, 0);
 
 	/* Only a successfully created link is recorded in the intent log. */
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
 		zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
 	}
 
 	dmu_tx_commit(tx);
 
 	/*
 	 * NOTE(review): `ct' is not declared anywhere in this function;
 	 * presumably this only builds because vnevent_link() expands to
 	 * nothing on this platform -- confirm.
 	 */
 	if (error == 0) {
 		vnevent_link(ZTOV(szp), ct);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 out:
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Free or allocate space in a file.  Only the `F_FREESP' command is
  * accepted; the name is somewhat misleading, since the command can
  * allocate space as well as free it.
  *
  *	IN:	zp	- znode of file to free data in.
  *		cmd	- action to take (only F_FREESP supported).
  *		bfp	- section of file to free/alloc.
  *		flag	- current file open mode flags.
  *		offset	- current file offset (unused).
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	zp - ctime|mtime updated
  */
 int
 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
     offset_t offset, cred_t *cr)
 {
 	(void) offset;
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	uint64_t	off, len;
 	int		error;
 
 	error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (error != 0)
 		return (error);
 
 	if (cmd != F_FREESP) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * The caller may be unable to tell that the file system is
 	 * read-only, so that is checked explicitly here.
 	 */
 	if (zfs_is_readonly(zfsvfs)) {
 		error = SET_ERROR(EROFS);
 		goto out;
 	}
 
 	if (bfp->l_len < 0) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * Solaris skips the permission check because zfs_space() is only
 	 * reachable there through an already opened file handle.  Other
 	 * entry points operate directly on the inode, so write access is
 	 * verified here.
 	 */
 	error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL);
 	if (error != 0)
 		goto out;
 
 	off = bfp->l_start;
 	len = bfp->l_len; /* 0 means from off to end of file */
 
 	error = zfs_freesp(zp, off, len, flag, TRUE);
 
 out:
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Called when a vnode becomes inactive.  Vnodes whose znode has lost
  * its SA handle, or whose file has been unlinked, are recycled at once;
  * otherwise a dirty access time is written back in its own transaction.
  * cr and ct are unused.
  */
 static void
 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
 {
 	(void) cr, (void) ct;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int error;
 
 	ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
 
 	/*
 	 * Two cases are recycled immediately:
 	 *  - no SA handle: the fs has been unmounted, or we did a
 	 *    suspend/resume and this file no longer exists;
 	 *  - the file was removed (fast path).
 	 */
 	if (zp->z_sa_hdl == NULL || zp->z_unlinked) {
 		ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
 		vrecycle(vp);
 		return;
 	}
 
 	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
 		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, zp);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error == 0) {
 			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
 			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
 			zp->z_atime_dirty = 0;
 			dmu_tx_commit(tx);
 		} else {
 			/* Best effort: the atime simply stays dirty. */
 			dmu_tx_abort(tx);
 		}
 	}
 
 	ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
 }
 
 
 /* Both ZFS file-id layouts are written in place over a struct fid. */
 _Static_assert(sizeof (struct zfid_short) <= sizeof (struct fid),
 	"struct zfid_short bigger than struct fid");
 _Static_assert(sizeof (struct zfid_long) <= sizeof (struct fid),
 	"struct zfid_long bigger than struct fid");
 
 /*
  * Fill fidp with a file identifier for vp: the object number and the
  * generation number, each stored least-significant byte first.  When
  * zfsvfs->z_parent differs from zfsvfs the long form is produced, which
  * additionally carries the objset id.  ct is unused.
  *
  * Returns 0 on success, or the error from entering the file system or
  * from looking up the generation attribute.
  */
 static int
 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
 {
 	(void) ct;
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	uint32_t	gen;
 	uint64_t	gen64;
 	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid;
 	int		size, i, error;
 
 	error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (error != 0)
 		return (error);
 
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
 	    &gen64, sizeof (uint64_t));
 	if (error != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	gen = (uint32_t)gen64;
 	if (gen == 0)
 		gen = 1;
 
 	if (zfsvfs->z_parent != zfsvfs)
 		size = LONG_FID_LEN;
 	else
 		size = SHORT_FID_LEN;
 
 	fidp->fid_len = size;
 	zfid = (zfid_short_t *)fidp;
 	zfid->zf_len = size;
 
 	for (i = 0; i < sizeof (zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
 
 	if (size == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
 
 		/* XXX - this should be the generation number for the objset */
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			zlfid->zf_setgen[i] = 0;
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (0);
 }
 
 /*
  * Answer a pathconf query for vp.  The value for the variable selected
  * by cmd is stored in *valp.
  *
  * Returns 0 when the variable is handled here, EOPNOTSUPP when it is
  * not, or the error from entering the file system (_PC_ACL_NFS4 only).
  */
 static int
 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
     caller_context_t *ct)
 {
 	znode_t *zp;
 	zfsvfs_t *zfsvfs;
 	int error;
 
 	switch (cmd) {
 	case _PC_LINK_MAX:
 		*valp = MIN(LONG_MAX, ZFS_LINK_MAX);
 		break;
 
 	case _PC_FILESIZEBITS:
 		*valp = 64;
 		break;
 
 	case _PC_MIN_HOLE_SIZE:
 		*valp = (int)SPA_MINBLOCKSIZE;
 		break;
 
 	case _PC_ACL_EXTENDED:
 #if 0		/* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 			return (error);
 		*valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0;
 		zfs_exit(zfsvfs, FTAG);
 #else
 		*valp = 0;
 #endif
 		break;
 
 	case _PC_ACL_NFS4:
 		/* The answer depends on the ACL type of this file system. */
 		zp = VTOZ(vp);
 		zfsvfs = zp->z_zfsvfs;
 		error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 		if (error != 0)
 			return (error);
 		*valp = (zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4) ? 1 : 0;
 		zfs_exit(zfsvfs, FTAG);
 		break;
 
 	case _PC_ACL_PATH_MAX:
 		*valp = ACL_MAX_ENTRIES;
 		break;
 
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 /*
  * Read the pages ma[0..count-1] of vp's VM object from the DMU.
  *
  *	IN:	vp	- vnode backing the pages.
  *		ma	- array of pages to fill.
  *		count	- number of entries in ma.
  *		rbehind	- if not NULL, upper bound on read-behind pages.
  *		rahead	- if not NULL, upper bound on read-ahead pages.
  *
  *	OUT:	rbehind	- number of read-behind pages reported back.
  *		rahead	- number of read-ahead pages reported back.
  *
  *	RETURN:	zfs_vm_pagerret_ok on success, zfs_vm_pagerret_bad if the
  *		last requested page starts at or beyond the object size,
  *		zfs_vm_pagerret_error on any other failure.
  */
 static int
 zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
     int *rahead)
 {
 	znode_t *zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zfs_locked_range_t *lr;
 	vm_object_t object;
 	off_t start, end, obj_size;
 	uint_t blksz;
 	int pgsin_b, pgsin_a;
 	int error;
 
 	if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
 		return (zfs_vm_pagerret_error);
 
 	/* Byte range [start, end) spanned by the requested pages. */
 	start = IDX_TO_OFF(ma[0]->pindex);
 	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
 
 	/*
 	 * Lock a range covering all required and optional pages.
 	 * Note that we need to handle the case of the block size growing.
 	 */
 	for (;;) {
 		blksz = zp->z_blksz;
 		lr = zfs_rangelock_tryenter(&zp->z_rangelock,
 		    rounddown(start, blksz),
 		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
 		if (lr == NULL) {
 			/*
 			 * The try-lock failed: continue without a range
 			 * lock, and give up read-ahead and read-behind by
 			 * reporting zero and clearing the local pointers.
 			 */
 			if (rahead != NULL) {
 				*rahead = 0;
 				rahead = NULL;
 			}
 			if (rbehind != NULL) {
 				*rbehind = 0;
 				rbehind = NULL;
 			}
 			break;
 		}
 		/* Done if the block size is unchanged; otherwise retry. */
 		if (blksz == zp->z_blksz)
 			break;
 		zfs_rangelock_exit(lr);
 	}
 
 	/* Sample the object size while holding the VM object lock. */
 	object = ma[0]->object;
 	zfs_vmobject_wlock(object);
 	obj_size = object->un_pager.vnp.vnp_size;
 	zfs_vmobject_wunlock(object);
 	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
 		if (lr != NULL)
 			zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (zfs_vm_pagerret_bad);
 	}
 
 	/* Read-behind: pages back to the block start, capped by *rbehind. */
 	pgsin_b = 0;
 	if (rbehind != NULL) {
 		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
 		pgsin_b = MIN(*rbehind, pgsin_b);
 	}
 
 	/*
 	 * Read-ahead: pages up to the block end, clipped to the page-rounded
 	 * object size and capped by *rahead.
 	 */
 	pgsin_a = 0;
 	if (rahead != NULL) {
 		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
 		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
 			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
 		pgsin_a = MIN(*rahead, pgsin_a);
 	}
 
 	/*
 	 * NB: we need to pass the exact byte size of the data that we expect
 	 * to read after accounting for the file size.  This is required because
 	 * ZFS will panic if we request DMU to read beyond the end of the last
 	 * allocated block.
 	 */
 	error = dmu_read_pages(zfsvfs->z_os, zp->z_id, ma, count, &pgsin_b,
 	    &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE));
 
 	if (lr != NULL)
 		zfs_rangelock_exit(lr);
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 
 	/* The read kstat is updated even when dmu_read_pages() failed. */
 	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE);
 
 	zfs_exit(zfsvfs, FTAG);
 
 	if (error != 0)
 		return (zfs_vm_pagerret_error);
 
 	VM_CNT_INC(v_vnodein);
 	VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
 	if (rbehind != NULL)
 		*rbehind = pgsin_b;
 	if (rahead != NULL)
 		*rahead = pgsin_a;
 	return (zfs_vm_pagerret_ok);
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_getpages_args {
 	struct vnode *a_vp;
 	vm_page_t *a_m;
 	int a_count;
 	int *a_rbehind;
 	int *a_rahead;
 };
 #endif
 
 /*
  * Getpages vnode-operation entry point: unpack the argument structure
  * and hand the request to zfs_getpages().
  */
 static int
 zfs_freebsd_getpages(struct vop_getpages_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	vm_page_t *pages = ap->a_m;
 	int npages = ap->a_count;
 
 	return (zfs_getpages(vp, pages, npages, ap->a_rbehind,
 	    ap->a_rahead));
 }
 
 /*
  * Write the pages ma[] of vp's VM object back to the DMU.
  *
  *	IN:	vp	- vnode backing the pages.
  *		ma	- array of pages to write; all belong to vp->v_object.
  *		len	- number of bytes covered by ma; a non-zero multiple
  *			  of the page size.
  *		flags	- pager flags; zfs_vm_pagerput_sync and
  *			  zfs_vm_pagerput_inval force a zil_commit().
  *
  *	OUT:	rtvals	- per-page result: zfs_vm_pagerret_ok for pages
  *			  that were written, zfs_vm_pagerret_bad for pages
  *			  lying entirely beyond the object size, and
  *			  zfs_vm_pagerret_error otherwise.
  *
  *	RETURN:	rtvals[0].
  */
 static int
 zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
     int *rtvals)
 {
 	znode_t		*zp = VTOZ(vp);
 	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
 	zfs_locked_range_t		*lr;
 	dmu_tx_t	*tx;
 	struct sf_buf	*sf;
 	vm_object_t	object;
 	vm_page_t	m;
 	caddr_t		va;
 	size_t		tocopy;
 	size_t		lo_len;
 	vm_ooffset_t	lo_off;
 	vm_ooffset_t	off;
 	uint_t		blksz;
 	int		ncount;
 	int		pcount;
 	int		err;
 	int		i;
 
 	object = vp->v_object;
 	KASSERT(ma[0]->object == object, ("mismatching object"));
 	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
 
 	/* Every page is marked as failed until it is known to be written. */
 	pcount = btoc(len);
 	ncount = pcount;
 	for (i = 0; i < pcount; i++)
 		rtvals[i] = zfs_vm_pagerret_error;
 
 	if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0)
 		return (zfs_vm_pagerret_error);
 
 	/* Take a writer range lock rounded out to whole blocks. */
 	off = IDX_TO_OFF(ma[0]->pindex);
 	blksz = zp->z_blksz;
 	lo_off = rounddown(off, blksz);
 	lo_len = roundup(len + (off - lo_off), blksz);
 	lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
 
 	/*
 	 * Clip the request to the current object size; pages entirely
 	 * beyond it are reported as zfs_vm_pagerret_bad.
 	 */
 	zfs_vmobject_wlock(object);
 	if (len + off > object->un_pager.vnp.vnp_size) {
 		if (object->un_pager.vnp.vnp_size > off) {
 			int pgoff;
 
 			len = object->un_pager.vnp.vnp_size - off;
 			ncount = btoc(len);
 			if ((pgoff = (int)len & PAGE_MASK) != 0) {
 				/*
 				 * If the object is locked and the following
 				 * conditions hold, then the page's dirty
 				 * field cannot be concurrently changed by a
 				 * pmap operation.
 				 */
 				m = ma[ncount - 1];
 				vm_page_assert_sbusied(m);
 				KASSERT(!pmap_page_is_write_mapped(m),
 				    ("zfs_putpages: page %p is not read-only",
 				    m));
 				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
 				    pgoff);
 			}
 		} else {
 			len = 0;
 			ncount = 0;
 		}
 		if (ncount < pcount) {
 			for (i = ncount; i < pcount; i++) {
 				rtvals[i] = zfs_vm_pagerret_bad;
 			}
 		}
 	}
 	zfs_vmobject_wunlock(object);
 
 	if (ncount == 0)
 		goto out;
 
 	/* Over-quota writes are refused; the pages keep their error status. */
 	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) ||
 	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) ||
 	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
 	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
 	    zp->z_projid))) {
 		goto out;
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, off, len);
 
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		dmu_tx_abort(tx);
 		goto out;
 	}
 
 	if (zp->z_blksz < PAGE_SIZE) {
 		/*
 		 * Copy page by page using private cursors.  off and len
 		 * must keep describing the whole write, because they are
 		 * passed to zfs_log_write() and to the write kstats below;
 		 * advancing them here would log a zero-length write at
 		 * the end offset.
 		 */
 		vm_ooffset_t	woff = off;
 		size_t		wlen = len;
 
 		for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) {
 			tocopy = wlen > PAGE_SIZE ? PAGE_SIZE : wlen;
 			va = zfs_map_page(ma[i], &sf);
 			dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy,
 			    va, tx);
 			zfs_unmap_page(sf);
 		}
 	} else {
 		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
 	}
 
 	if (err == 0) {
 		uint64_t mtime[2], ctime[2];
 		sa_bulk_attr_t bulk[3];
 		int count = 0;
 
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    &mtime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    &ctime, 16);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 		    &zp->z_pflags, 8);
 		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
 		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		ASSERT0(err);
 		/*
 		 * XXX we should be passing a callback to undirty
 		 * but that would make the locking messier
 		 */
 		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off,
 		    len, 0, NULL, NULL);
 
 		zfs_vmobject_wlock(object);
 		for (i = 0; i < ncount; i++) {
 			rtvals[i] = zfs_vm_pagerret_ok;
 			vm_page_undirty(ma[i]);
 		}
 		zfs_vmobject_wunlock(object);
 		VM_CNT_INC(v_vnodeout);
 		VM_CNT_ADD(v_vnodepgsout, ncount);
 	}
 	dmu_tx_commit(tx);
 
 out:
 	zfs_rangelock_exit(lr);
 	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
 	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zfsvfs->z_log, zp->z_id);
 
 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (rtvals[0]);
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_putpages_args {
 	struct vnode *a_vp;
 	vm_page_t *a_m;
 	int a_count;
 	int a_sync;
 	int *a_rtvals;
 };
 #endif
 
 /*
  * Putpages vnode-operation entry point: unpack the argument structure
  * and hand the request to zfs_putpages().  a_count is passed as the
  * length argument and a_sync as the flags argument.
  */
 static int
 zfs_freebsd_putpages(struct vop_putpages_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	vm_page_t *pages = ap->a_m;
 
 	return (zfs_putpages(vp, pages, ap->a_count, ap->a_sync,
 	    ap->a_rtvals));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_bmap_args {
 	struct vnode *a_vp;
 	daddr_t  a_bn;
 	struct bufobj **a_bop;
 	daddr_t *a_bnp;
 	int *a_runp;
 	int *a_runb;
 };
 #endif
 
 static int
 zfs_freebsd_bmap(struct vop_bmap_args *ap)
 {
 
 	if (ap->a_bop != NULL)
 		*ap->a_bop = &ap->a_vp->v_bufobj;
 	if (ap->a_bnp != NULL)
 		*ap->a_bnp = ap->a_bn;
 	if (ap->a_runp != NULL)
 		*ap->a_runp = 0;
 	if (ap->a_runb != NULL)
 		*ap->a_runb = 0;
 
 	return (0);
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_open_args {
 	struct vnode *a_vp;
 	int a_mode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Open vnode operation: call zfs_open() and, if it succeeds, create the
  * VM object for the vnode sized to the znode's current z_size.
  * Returns the result of zfs_open().
  */
 static int
 zfs_freebsd_open(struct vop_open_args *ap)
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	int error;
 
 	error = zfs_open(&vp, ap->a_mode, ap->a_cred);
 	if (error != 0)
 		return (error);
 
 	vnode_create_vobject(vp, zp->z_size, ap->a_td);
 	return (0);
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_close_args {
 	struct vnode *a_vp;
 	int  a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Close vnode operation: forward to zfs_close() with the constant
  * arguments 1 and 0 in the third and fourth positions.
  */
 static int
 zfs_freebsd_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int fflag = ap->a_fflag;
 
 	return (zfs_close(vp, fflag, 1, 0, ap->a_cred));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_ioctl_args {
 	struct vnode *a_vp;
 	ulong_t a_command;
 	caddr_t a_data;
 	int a_fflag;
 	struct ucred *cred;
 	struct thread *td;
 };
 #endif
 
 static int
 zfs_freebsd_ioctl(struct vop_ioctl_args *ap)
 {
 
 	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
 	    ap->a_fflag, ap->a_cred, NULL));
 }
 
 /*
  * Translate the IO_APPEND, IO_NDELAY and IO_SYNC bits of ioflags into
  * the corresponding O_APPEND, O_NONBLOCK and O_SYNC bits.  Any other
  * bit in ioflags is ignored.
  */
 static int
 ioflags(int ioflags)
 {
 	int flags;
 
 	flags = ((ioflags & IO_APPEND) != 0) ? O_APPEND : 0;
 	flags |= ((ioflags & IO_NDELAY) != 0) ? O_NONBLOCK : 0;
 	flags |= ((ioflags & IO_SYNC) != 0) ? O_SYNC : 0;
 
 	return (flags);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_read_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int a_ioflag;
 	struct ucred *a_cred;
 };
 #endif
 
 static int
 zfs_freebsd_read(struct vop_read_args *ap)
 {
 	zfs_uio_t uio;
 	zfs_uio_init(&uio, ap->a_uio);
 	return (zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
 	    ap->a_cred));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_write_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	int a_ioflag;
 	struct ucred *a_cred;
 };
 #endif
 
 static int
 zfs_freebsd_write(struct vop_write_args *ap)
 {
 	zfs_uio_t uio;
 	zfs_uio_init(&uio, ap->a_uio);
 	return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag),
 	    ap->a_cred));
 }
 
 #if __FreeBSD_version >= 1300102
 /*
  * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
  * the comment above cache_fplookup for details.
  *
  * Returns 0 only when the znode is available, is neither AV-quarantined
  * nor an extended attribute, and has ZFS_NO_EXECS_DENIED set.  Every
  * other case returns EAGAIN.
  */
 static int
 zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
 {
 	vnode_t *vp = v->a_vp;
 	znode_t *zp = VTOZ_SMR(vp);
 	uint64_t pflags;
 
 	if (__predict_false(zp == NULL))
 		return (EAGAIN);
 
 	/* Take one snapshot of the flags and test all bits against it. */
 	pflags = atomic_load_64(&zp->z_pflags);
 	if ((pflags & (ZFS_AV_QUARANTINED | ZFS_XATTR)) != 0 ||
 	    (pflags & ZFS_NO_EXECS_DENIED) == 0)
 		return (EAGAIN);
 
 	return (0);
 }
 #endif
 
 #if __FreeBSD_version >= 1300139
 /*
  * Resolve a symlink from the target string cached on the znode.
  * Returns EAGAIN when the znode or its cached target is not available;
  * otherwise returns the result of cache_symlink_resolve().
  */
 static int
 zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v)
 {
 	vnode_t *vp = v->a_vp;
 	znode_t *zp = VTOZ_SMR(vp);
 	char *target;
 
 	if (__predict_false(zp == NULL))
 		return (EAGAIN);
 
 	target = atomic_load_consume_ptr(&zp->z_cached_symlink);
 	if (target == NULL)
 		return (EAGAIN);
 
 	return (cache_symlink_resolve(v->a_fpl, target, strlen(target)));
 }
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_access_args {
 	struct vnode *a_vp;
 	accmode_t a_accmode;
 	struct ucred *a_cred;
 	struct thread *a_td;
 };
 #endif
 
 static int
 zfs_freebsd_access(struct vop_access_args *ap)
 {
 	vnode_t *vp = ap->a_vp;
 	znode_t *zp = VTOZ(vp);
 	accmode_t accmode;
 	int error = 0;
 
 
 	if (ap->a_accmode == VEXEC) {
 		if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
 			return (0);
 	}
 
 	/*
 	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
 	 */
 	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
 	if (accmode != 0)
 		error = zfs_access(zp, accmode, 0, ap->a_cred);
 
 	/*
 	 * VADMIN has to be handled by vaccess().
 	 */
 	if (error == 0) {
 		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
 		if (accmode != 0) {
 #if __FreeBSD_version >= 1300105
 			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
 			    zp->z_gid, accmode, ap->a_cred);
 #else
 			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
 			    zp->z_gid, accmode, ap->a_cred, NULL);
 #endif
 		}
 	}
 
 	/*
 	 * For VEXEC, ensure that at least one execute bit is set for
 	 * non-directories.
 	 */
 	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
 	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
 		error = EACCES;
 	}
 
 	return (error);
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_lookup_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Look up the name in ap->a_cnp within directory ap->a_dvp.  The name
  * is first copied into a local NUL-terminated buffer, bounded by both
  * cn_namelen + 1 and the buffer size, and then passed to zfs_lookup()
  * along with the `cached' flag.
  */
 static int
 zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
 {
 	struct componentname *cnp = ap->a_cnp;
 	char nm[NAME_MAX + 1];
 	size_t copylen;
 
 	ASSERT3U(cnp->cn_namelen, <, sizeof (nm));
 	copylen = MIN(cnp->cn_namelen + 1, sizeof (nm));
 	strlcpy(nm, cnp->cn_nameptr, copylen);
 
 	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
 	    cnp->cn_cred, 0, cached));
 }
 
 /*
  * Cached-lookup vnode operation: reinterpret the arguments as
  * vop_lookup_args and perform the lookup with `cached' set to B_TRUE.
  */
 static int
 zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
 {
 	struct vop_lookup_args *lap = (struct vop_lookup_args *)ap;
 
 	return (zfs_freebsd_lookup(lap, B_TRUE));
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_lookup_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Lookup vnode operation.  When the file system has z_use_namecache
  * set the request goes through vfs_cache_lookup(); otherwise it goes
  * straight to zfs_freebsd_lookup() with `cached' set to B_FALSE.
  */
 static int
 zfs_cache_lookup(struct vop_lookup_args *ap)
 {
 	zfsvfs_t *zfsvfs = ap->a_dvp->v_mount->mnt_data;
 
 	if (!zfsvfs->z_use_namecache)
 		return (zfs_freebsd_lookup(ap, B_FALSE));
 
 	return (vfs_cache_lookup(ap));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 static int
 zfs_freebsd_create(struct vop_create_args *ap)
 {
 	zfsvfs_t *zfsvfs;
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	znode_t *zp = NULL;
 	int rc, mode;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(cnp->cn_flags & SAVENAME);
 #endif
 
 	vattr_init_mask(vap);
 	mode = vap->va_mode & ALLPERMS;
 	zfsvfs = ap->a_dvp->v_mount->mnt_data;
 	*ap->a_vpp = NULL;
 
 	rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, 0, mode,
 	    &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL);
 	if (rc == 0)
 		*ap->a_vpp = ZTOV(zp);
 	if (zfsvfs->z_use_namecache &&
 	    rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
 
 	return (rc);
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_remove_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Remove vnode operation: forward to zfs_remove_() with the name and
  * credentials taken from the component name.
  */
 static int
 zfs_freebsd_remove(struct vop_remove_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(cnp->cn_flags & SAVENAME);
 #endif
 
 	return (zfs_remove_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr,
 	    cnp->cn_cred));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_mkdir_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 };
 #endif
 
 static int
 zfs_freebsd_mkdir(struct vop_mkdir_args *ap)
 {
 	vattr_t *vap = ap->a_vap;
 	znode_t *zp = NULL;
 	int rc;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
 #endif
 
 	vattr_init_mask(vap);
 	*ap->a_vpp = NULL;
 
 	rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp,
 	    ap->a_cnp->cn_cred, 0, NULL, NULL);
 
 	if (rc == 0)
 		*ap->a_vpp = ZTOV(zp);
 	return (rc);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_rmdir_args {
 	struct vnode *a_dvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 static int
 zfs_freebsd_rmdir(struct vop_rmdir_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(cnp->cn_flags & SAVENAME);
 #endif
 
 	return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_readdir_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 	int *a_eofflag;
 	int *a_ncookies;
 	cookie_t **a_cookies;
 };
 #endif
 
 static int
 zfs_freebsd_readdir(struct vop_readdir_args *ap)
 {
 	zfs_uio_t uio;
 	zfs_uio_init(&uio, ap->a_uio);
 	return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag,
 	    ap->a_ncookies, ap->a_cookies));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_fsync_args {
 	struct vnode *a_vp;
 	int a_waitfor;
 	struct thread *a_td;
 };
 #endif
 
 static int
 zfs_freebsd_fsync(struct vop_fsync_args *ap)
 {
 
 	return (zfs_fsync(VTOZ(ap->a_vp), 0, ap->a_td->td_ucred));
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_getattr_args {
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Getattr vnode operation.  The caller's vattr is copied into an
  * xvattr_t, a set of optional attributes is requested, and
  * zfs_getattr() is called.  On success the plain vattr part is copied
  * back to the caller and the returned optional attributes are folded
  * into va_flags as chflags-style bits.
  *
  * Returns 0 on success or the error from zfs_getattr(); on error the
  * caller's vattr is left untouched.
  */
 static int
 zfs_freebsd_getattr(struct vop_getattr_args *ap)
 {
 	vattr_t *vap = ap->a_vap;
 	xvattr_t xvap;
 	ulong_t fflags = 0;
 	int error;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 	xvap.xva_vattr.va_mask |= AT_XVATTR;
 
 	/* Convert chflags into ZFS-type flags. */
 	/* XXX: what about SF_SETTABLE?. */
 	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
 	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
 	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
 	XVA_SET_REQ(&xvap, XAT_NODUMP);
 	XVA_SET_REQ(&xvap, XAT_READONLY);
 	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
 	XVA_SET_REQ(&xvap, XAT_SYSTEM);
 	XVA_SET_REQ(&xvap, XAT_HIDDEN);
 	XVA_SET_REQ(&xvap, XAT_REPARSE);
 	XVA_SET_REQ(&xvap, XAT_OFFLINE);
 	XVA_SET_REQ(&xvap, XAT_SPARSE);
 
 	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred);
 	if (error != 0)
 		return (error);
 
 	/* Convert ZFS xattr into chflags. */
 	/*
 	 * FLAG_CHECK sets fflag in fflags only when the attribute xflag was
 	 * actually returned and its value xfield is non-zero.
 	 */
 #define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
 	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
 		fflags |= (fflag);					\
 } while (0)
 	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
 	    xvap.xva_xoptattrs.xoa_immutable);
 	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
 	    xvap.xva_xoptattrs.xoa_appendonly);
 	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
 	    xvap.xva_xoptattrs.xoa_nounlink);
 	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
 	    xvap.xva_xoptattrs.xoa_archive);
 	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
 	    xvap.xva_xoptattrs.xoa_nodump);
 	FLAG_CHECK(UF_READONLY, XAT_READONLY,
 	    xvap.xva_xoptattrs.xoa_readonly);
 	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
 	    xvap.xva_xoptattrs.xoa_system);
 	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
 	    xvap.xva_xoptattrs.xoa_hidden);
 	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
 	    xvap.xva_xoptattrs.xoa_reparse);
 	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
 	    xvap.xva_xoptattrs.xoa_offline);
 	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
 	    xvap.xva_xoptattrs.xoa_sparse);
 
 #undef	FLAG_CHECK
 	/* Hand back the plain attributes, then overwrite the flags word. */
 	*vap = xvap.xva_vattr;
 	vap->va_flags = fflags;
 	return (0);
 }
 
 /* Argument layout, compiled only when _SYS_SYSPROTO_H_ is undefined. */
 #ifndef _SYS_SYSPROTO_H_
 struct vop_setattr_args {
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Setattr vnode operation.  The caller's vattr is copied into an
  * xvattr_t.  If va_flags is being set, the request is validated
  * (supported flags, privilege, securelevel) and each flag that differs
  * from the znode's current z_pflags is turned into an optional-attribute
  * request.  A birthtime change is requested as XAT_CREATETIME.  The
  * result is then applied with zfs_setattr().
  *
  * Returns EOPNOTSUPP if flags are being set and either the file system
  * has z_use_fuids == B_FALSE or an unsupported flag bit is present;
  * EPERM or another error if the privilege checks fail; otherwise the
  * result of zfs_setattr().
  */
 static int
 zfs_freebsd_setattr(struct vop_setattr_args *ap)
 {
 	vnode_t *vp = ap->a_vp;
 	vattr_t *vap = ap->a_vap;
 	cred_t *cred = ap->a_cred;
 	xvattr_t xvap;
 	ulong_t fflags;
 	uint64_t zflags;
 
 	vattr_init_mask(vap);
 	vap->va_mask &= ~AT_NOSET;
 
 	xva_init(&xvap);
 	xvap.xva_vattr = *vap;
 
 	/* Current on-znode flags, used below to detect which bits change. */
 	zflags = VTOZ(vp)->z_pflags;
 
 	if (vap->va_flags != VNOVAL) {
 		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
 		int error;
 
 		if (zfsvfs->z_use_fuids == B_FALSE)
 			return (EOPNOTSUPP);
 
 		fflags = vap->va_flags;
 		/*
 		 * XXX KDM
 		 * We need to figure out whether it makes sense to allow
 		 * UF_REPARSE through, since we don't really have other
 		 * facilities to handle reparse points and zfs_setattr()
 		 * doesn't currently allow setting that attribute anyway.
 		 */
 		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
 		    UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
 		    UF_OFFLINE|UF_SPARSE)) != 0)
 			return (EOPNOTSUPP);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
 		 * otherwise, they behave like unprivileged processes.
 		 */
 		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
 		    spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) {
 			/* Privileged path: only the securelevel can veto. */
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
 				error = securelevel_gt(cred, 0);
 				if (error != 0)
 					return (error);
 			}
 		} else {
 			/*
 			 * Callers may only modify the file flags on
 			 * objects they have VADMIN rights for.
 			 */
 			if ((error = VOP_ACCESS(vp, VADMIN, cred,
 			    curthread)) != 0)
 				return (error);
 			/* Refuse if a system flag is already set ... */
 			if (zflags &
 			    (ZFS_IMMUTABLE | ZFS_APPENDONLY |
 			    ZFS_NOUNLINK)) {
 				return (EPERM);
 			}
 			/* ... or if the request tries to set one. */
 			if (fflags &
 			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
 				return (EPERM);
 			}
 		}
 
 		/*
 		 * FLAG_CHANGE requests attribute xflag only when the state of
 		 * fflag in the request differs from the state of zflag on the
 		 * znode, and stores the requested state (0 or 1) in xfield.
 		 */
 #define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
 	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
 	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
 		XVA_SET_REQ(&xvap, (xflag));				\
 		(xfield) = ((fflags & (fflag)) != 0);			\
 	}								\
 } while (0)
 		/* Convert chflags into ZFS-type flags. */
 		/* XXX: what about SF_SETTABLE?. */
 		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
 		    xvap.xva_xoptattrs.xoa_immutable);
 		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
 		    xvap.xva_xoptattrs.xoa_appendonly);
 		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
 		    xvap.xva_xoptattrs.xoa_nounlink);
 		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
 		    xvap.xva_xoptattrs.xoa_archive);
 		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
 		    xvap.xva_xoptattrs.xoa_nodump);
 		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
 		    xvap.xva_xoptattrs.xoa_readonly);
 		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
 		    xvap.xva_xoptattrs.xoa_system);
 		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
 		    xvap.xva_xoptattrs.xoa_hidden);
 		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
 		    xvap.xva_xoptattrs.xoa_reparse);
 		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
 		    xvap.xva_xoptattrs.xoa_offline);
 		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
 		    xvap.xva_xoptattrs.xoa_sparse);
 #undef	FLAG_CHANGE
 	}
 	/* A birthtime update is requested as the XAT_CREATETIME attribute. */
 	if (vap->va_birthtime.tv_sec != VNOVAL) {
 		xvap.xva_vattr.va_mask |= AT_XVATTR;
 		XVA_SET_REQ(&xvap, XAT_CREATETIME);
 	}
 	return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred, NULL));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_rename_args {
 	struct vnode *a_fdvp;
 	struct vnode *a_fvp;
 	struct componentname *a_fcnp;
 	struct vnode *a_tdvp;
 	struct vnode *a_tvp;
 	struct componentname *a_tcnp;
 };
 #endif
 
 static int
 zfs_freebsd_rename(struct vop_rename_args *ap)
 {
 	vnode_t *fdvp = ap->a_fdvp;
 	vnode_t *fvp = ap->a_fvp;
 	vnode_t *tdvp = ap->a_tdvp;
 	vnode_t *tvp = ap->a_tvp;
 	int error;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
 	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
 #endif
 
 	error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
 	    ap->a_tcnp, ap->a_fcnp->cn_cred);
 
 	vrele(fdvp);
 	vrele(fvp);
 	vrele(tdvp);
 	if (tvp != NULL)
 		vrele(tvp);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_symlink_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
 	char *a_target;
 };
 #endif
 
 static int
 zfs_freebsd_symlink(struct vop_symlink_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	vattr_t *vap = ap->a_vap;
 	znode_t *zp = NULL;
 #if __FreeBSD_version >= 1300139
 	char *symlink;
 	size_t symlink_len;
 #endif
 	int rc;
 
 #if __FreeBSD_version < 1400068
 	ASSERT(cnp->cn_flags & SAVENAME);
 #endif
 
 	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
 	vattr_init_mask(vap);
 	*ap->a_vpp = NULL;
 
 	rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap,
 	    ap->a_target, &zp, cnp->cn_cred, 0 /* flags */, NULL);
 	if (rc == 0) {
 		*ap->a_vpp = ZTOV(zp);
 		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
 #if __FreeBSD_version >= 1300139
 		MPASS(zp->z_cached_symlink == NULL);
 		symlink_len = strlen(ap->a_target);
 		symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK);
 		if (symlink != NULL) {
 			memcpy(symlink, ap->a_target, symlink_len);
 			symlink[symlink_len] = '\0';
 			atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
 			    (uintptr_t)symlink);
 		}
 #endif
 	}
 	return (rc);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_readlink_args {
 	struct vnode *a_vp;
 	struct uio *a_uio;
 	struct ucred *a_cred;
 };
 #endif
 
 /*
  * Readlink vnode operation.  Reads the link target through zfs_readlink().
  * On kernels that support it, a successful read into a single kernel-space
  * iovec is also used to populate z_cached_symlink when no cached copy
  * exists yet.  Returns the zfs_readlink() result.
  */
 static int
 zfs_freebsd_readlink(struct vop_readlink_args *ap)
 {
 	zfs_uio_t uio;
 	int error;
 #if __FreeBSD_version >= 1300139
 	znode_t	*zp = VTOZ(ap->a_vp);
 	char *copy, *dst;
 	size_t nbytes;
 	bool cacheable = false;
 #endif
 
 	zfs_uio_init(&uio, ap->a_uio);
 #if __FreeBSD_version >= 1300139
 	/*
 	 * Remember where the destination buffer starts and how large it is
 	 * before the read; both are needed afterwards to copy the result.
 	 */
 	if (zfs_uio_segflg(&uio) == UIO_SYSSPACE &&
 	    zfs_uio_iovcnt(&uio) == 1) {
 		dst = zfs_uio_iovbase(&uio, 0);
 		nbytes = zfs_uio_iovlen(&uio, 0);
 		cacheable = true;
 	}
 #endif
 	error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL);
 #if __FreeBSD_version >= 1300139
 	if (atomic_load_ptr(&zp->z_cached_symlink) != NULL)
 		return (error);
 	if (error != 0 || !cacheable)
 		return (error);
 
 	/* Bytes actually produced: buffer size minus what is left over. */
 	nbytes -= zfs_uio_resid(&uio);
 	copy = cache_symlink_alloc(nbytes + 1, M_WAITOK);
 	if (copy == NULL)
 		return (error);
 
 	memcpy(copy, dst, nbytes);
 	copy[nbytes] = '\0';
 	/* Install only if the slot is still empty; otherwise discard. */
 	if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink,
 	    (uintptr_t)NULL, (uintptr_t)copy)) {
 		cache_symlink_free(copy, nbytes + 1);
 	}
 #endif
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_link_args {
 	struct vnode *a_tdvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
 };
 #endif
 
 /*
  * Link vnode operation.  Refuses with EXDEV when the target directory and
  * the source vnode are on different mounts; otherwise hands off to
  * zfs_link().
  */
 static int
 zfs_freebsd_link(struct vop_link_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	vnode_t *target_dir = ap->a_tdvp;
 	vnode_t *source = ap->a_vp;
 
 	if (source->v_mount != target_dir->v_mount)
 		return (EXDEV);
 
 #if __FreeBSD_version < 1400068
 	ASSERT(cnp->cn_flags & SAVENAME);
 #endif
 
 	return (zfs_link(VTOZ(target_dir), VTOZ(source),
 	    cnp->cn_nameptr, cnp->cn_cred, 0));
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Inactive vnode operation.  Calls zfs_inactive() with the credentials of
  * the current thread (newer kernels) or of the thread passed in the
  * arguments (older kernels).  Always returns 0.
  */
 static int
 zfs_freebsd_inactive(struct vop_inactive_args *ap)
 {
 	struct ucred *cred;
 
 #if __FreeBSD_version >= 1300123
 	cred = curthread->td_ucred;
 #else
 	cred = ap->a_td->td_ucred;
 #endif
 	zfs_inactive(ap->a_vp, cred, NULL);
 	return (0);
 }
 
 #if __FreeBSD_version >= 1300042
 #ifndef _SYS_SYSPROTO_H_
 struct vop_need_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Report whether the inactive operation has work to do for this vnode.
  * Returns 1 when a page queue flush is needed, when the teardown-inactive
  * lock cannot be taken without blocking, or when the znode has no SA
  * handle, is unlinked, or has a dirty atime; otherwise returns 0.
  */
 static int
 zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap)
 {
 	znode_t	*zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	int answer = 1;
 
 	if (vn_need_pageq_flush(ap->a_vp))
 		return (1);
 
 	/* If the lock is unavailable, keep the conservative answer of 1. */
 	if (ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs)) {
 		answer = (zp->z_sa_hdl == NULL || zp->z_unlinked ||
 		    zp->z_atime_dirty);
 		ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
 	}
 
 	return (answer);
 }
 #endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_reclaim_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
 };
 #endif
 
 /*
  * Reclaim vnode operation.  Detaches the znode from the vnode: a znode
  * without an SA handle is freed outright, any other is passed to
  * zfs_zinactive().  The vnode's v_data is cleared afterwards.  Always
  * returns 0.
  */
 static int
 zfs_freebsd_reclaim(struct vop_reclaim_args *ap)
 {
 	vnode_t	*vp = ap->a_vp;
 	znode_t	*zp = VTOZ(vp);
 	zfsvfs_t *zfsvfs;
 
 	/* Check the znode before it is dereferenced for the first time. */
 	ASSERT3P(zp, !=, NULL);
 
 	/*
 	 * Fetch the zfsvfs now: it is still needed to drop the lock below,
 	 * after zp has been handed to zfs_znode_free()/zfs_zinactive().
 	 */
 	zfsvfs = zp->z_zfsvfs;
 
 #if __FreeBSD_version < 1300042
 	/* Destroy the vm object and flush associated pages. */
 	vnode_destroy_vobject(vp);
 #endif
 	/*
 	 * z_teardown_inactive_lock protects from a race with
 	 * zfs_znode_dmu_fini in zfsvfs_teardown during
 	 * force unmount.
 	 */
 	ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs);
 	if (zp->z_sa_hdl == NULL)
 		zfs_znode_free(zp);
 	else
 		zfs_zinactive(zp);
 	ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs);
 
 	vp->v_data = NULL;
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_fid_args {
 	struct vnode *a_vp;
 	struct fid *a_fid;
 };
 #endif
 
 /*
  * Fid vnode operation: thin wrapper that passes the vnode and the caller's
  * fid buffer to zfs_fid() and returns its result.
  */
 static int
 zfs_freebsd_fid(struct vop_fid_args *ap)
 {
 	void *fid_buf = (void *)ap->a_fid;
 
 	return (zfs_fid(ap->a_vp, fid_buf, NULL));
 }
 
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_pathconf_args {
 	struct vnode *a_vp;
 	int a_name;
 	register_t *a_retval;
 } *ap;
 #endif
 
 /*
  * Pathconf vnode operation.  zfs_pathconf() is asked first; only when it
  * answers EOPNOTSUPP are the names below handled here, with everything
  * else deferred to vop_stdpathconf().
  */
 static int
 zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
 {
 	ulong_t val;
 	int error;
 
 	error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
 	    curthread->td_ucred, NULL);
 	switch (error) {
 	case 0:
 		*ap->a_retval = val;
 		return (0);
 	case EOPNOTSUPP:
 		/* Not known to zfs_pathconf(); try the local table. */
 		break;
 	default:
 		return (error);
 	}
 
 	if (ap->a_name == _PC_NAME_MAX) {
 		*ap->a_retval = NAME_MAX;
 		return (0);
 	}
 #if __FreeBSD_version >= 1400032
 	if (ap->a_name == _PC_DEALLOC_PRESENT) {
 		*ap->a_retval = 1;
 		return (0);
 	}
 #endif
 	if (ap->a_name == _PC_PIPE_BUF) {
 		/* Only answered for directories and FIFOs. */
 		if (ap->a_vp->v_type != VDIR && ap->a_vp->v_type != VFIFO)
 			return (EINVAL);
 		*ap->a_retval = PIPE_BUF;
 		return (0);
 	}
 	return (vop_stdpathconf(ap));
 }
 
 /*
  * Selects the name encoding tried first for user-namespace attributes:
  * non-zero means no prefix, zero means the "user." prefix.  The extended
  * attribute vnode operations in this file pass this value (and, where they
  * fall back, its negation) down as the compat argument of
  * zfs_create_attrname().
  */
 static int zfs_xattr_compat = 1;
 
 /*
  * Validate a caller-supplied attribute name.  Returns 0 when the name is
  * acceptable and EINVAL when it contains a '/' or starts with a namespace
  * prefix.
  */
 static int
 zfs_check_attrname(const char *name)
 {
 	int error = 0;
 
 	if (strchr(name, '/') != NULL) {
 		/* We don't allow '/' character in attribute name. */
 		error = SET_ERROR(EINVAL);
 	} else if (ZFS_XA_NS_PREFIX_FORBIDDEN(name)) {
 		/* Nor names that start with a namespace prefix. */
 		error = SET_ERROR(EINVAL);
 	}
 	return (error);
 }
 
 /*
  * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
  * extended attribute name:
  *
  *	NAMESPACE	XATTR_COMPAT	PREFIX
  *	system		*		freebsd:system:
  *	user		1		(none, can be used to access ZFS
  *					fsattr(5) attributes created on Solaris)
  *	user		0		user.
  */
 /*
  * Build the on-disk attribute name for (attrnamespace, name) into attrname,
  * a buffer of size bytes that is zeroed first.  Returns 0 on success,
  * EINVAL for an unsupported namespace and ENAMETOOLONG when the result
  * does not fit.
  */
 static int
 zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
     size_t size, boolean_t compat)
 {
 	const char *ns_lead, *ns_name, *ns_tail;
 
 	memset(attrname, 0, size);
 
 	if (attrnamespace == EXTATTR_NAMESPACE_USER) {
 		if (compat) {
 			/*
 			 * This is the default namespace by which we can access
 			 * all attributes created on Solaris.
 			 */
 			ns_lead = "";
 			ns_name = "";
 			ns_tail = "";
 		} else {
 			/*
 			 * This is compatible with the user namespace encoding
 			 * on Linux prior to xattr_compat, but nothing
 			 * else.
 			 */
 			ns_lead = "";
 			ns_name = "user";
 			ns_tail = ".";
 		}
 	} else if (attrnamespace == EXTATTR_NAMESPACE_SYSTEM) {
 		ns_lead = "freebsd:";
 		ns_name = EXTATTR_NAMESPACE_SYSTEM_STRING;
 		ns_tail = ":";
 	} else {
 		/* EXTATTR_NAMESPACE_EMPTY and anything unrecognized. */
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* A return value of size or more means the name was truncated. */
 	if (snprintf(attrname, size, "%s%s%s%s", ns_lead, ns_name, ns_tail,
 	    name) >= size) {
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 	return (0);
 }
 
 /*
  * Make sure zp->z_xattr_cached is populated, loading it with
  * zfs_sa_get_xattr() if necessary.
  *
  * The caller must hold z_xattr_lock (reader or writer).  A writer keeps the
  * lock as-is.  A reader is temporarily promoted to writer for the load and
  * downgraded to reader again before returning.
  *
  * Returns 0 if the cache was already present, otherwise the result of
  * zfs_sa_get_xattr().
  */
 static int
 zfs_ensure_xattr_cached(znode_t *zp)
 {
 	int error = 0;
 
 	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
 
 	if (zp->z_xattr_cached != NULL)
 		return (0);
 
 	/* Already exclusive: load in place, no lock juggling needed. */
 	if (rw_write_held(&zp->z_xattr_lock))
 		return (zfs_sa_get_xattr(zp));
 
 	/*
 	 * Held as reader.  If the in-place upgrade fails, drop the lock and
 	 * take it again as writer.
 	 */
 	if (!rw_tryupgrade(&zp->z_xattr_lock)) {
 		rw_exit(&zp->z_xattr_lock);
 		rw_enter(&zp->z_xattr_lock, RW_WRITER);
 	}
 	/*
 	 * Re-check: the lock may have been released above, so the cache can
 	 * have been filled in the meantime.
 	 */
 	if (zp->z_xattr_cached == NULL)
 		error = zfs_sa_get_xattr(zp);
 	rw_downgrade(&zp->z_xattr_lock);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_getextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 #endif
 
 static int
 zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname)
 {
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
 	    LOOKUP_XATTR, B_FALSE);
 	if (error != 0)
 		return (error);
 
 	flags = FREAD;
 #if __FreeBSD_version < 1400043
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
 	    xvp, td);
 #else
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
 #endif
 	error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL);
 	if (error != 0)
 		return (SET_ERROR(error));
 	vp = nd.ni_vp;
 	NDFREE_PNBUF(&nd);
 
 	if (ap->a_size != NULL) {
 		error = VOP_GETATTR(vp, &va, ap->a_cred);
 		if (error == 0)
 			*ap->a_size = (size_t)va.va_size;
 	} else if (ap->a_uio != NULL)
 		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK1(vp);
 	vn_close(vp, flags, ap->a_cred, td);
 	return (error);
 }
 
 static int
 zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	uchar_t *nv_value;
 	uint_t nv_size;
 	int error;
 
 	error = zfs_ensure_xattr_cached(zp);
 	if (error != 0)
 		return (error);
 
 	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
 	ASSERT3P(zp->z_xattr_cached, !=, NULL);
 
 	error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname,
 	    &nv_value, &nv_size);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	if (ap->a_size != NULL)
 		*ap->a_size = nv_size;
 	else if (ap->a_uio != NULL)
 		error = uiomove(nv_value, nv_size, ap->a_uio);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	return (0);
 }
 
 /*
  * Look up one attribute using the name encoding selected by compat.  The
  * SA copy is consulted first when the filesystem and the znode both use
  * SA; the xattr directory is tried when SA is not in use or reports ENOENT.
  */
 static int
 zfs_getextattr_impl(struct vop_getextattr_args *ap, boolean_t compat)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	char attrname[EXTATTR_MAXNAMELEN+1];
 	int error;
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof (attrname), compat);
 	if (error != 0)
 		return (error);
 
 	if (zfsvfs->z_use_sa && zp->z_is_sa) {
 		error = zfs_getextattr_sa(ap, attrname);
 		if (error != ENOENT)
 			return (error);
 	}
 	return (zfs_getextattr_dir(ap, attrname));
 }
 
 /*
  * Vnode operation to retrieve a named extended attribute.
  *
  * Fails with EOPNOTSUPP when the xattr property is off, and with the
  * respective error when the credential or name checks fail.  Otherwise the
  * attribute is looked up with z_xattr_lock held as reader, first using the
  * name encoding selected by zfs_xattr_compat and, for the user namespace
  * only, using the alternate encoding if the first lookup found nothing.
  * A final ENOENT is reported to the caller as ENOATTR.
  */
 static int
 zfs_getextattr(struct vop_getextattr_args *ap)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	int error;
 
 	/*
 	 * If the xattr property is off, refuse the request.
 	 */
 	if (!(zfsvfs->z_flags & ZSB_XATTR))
 		return (SET_ERROR(EOPNOTSUPP));
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	error = zfs_check_attrname(ap->a_name);
 	if (error != 0)
 		return (error);
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 	rw_enter(&zp->z_xattr_lock, RW_READER);
 
 	error = zfs_getextattr_impl(ap, zfs_xattr_compat);
 	if ((error == ENOENT || error == ENOATTR) &&
 	    ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
 		/*
 		 * Fall back to the alternate namespace format if we failed to
 		 * find a user xattr.
 		 */
 		error = zfs_getextattr_impl(ap, !zfs_xattr_compat);
 	}
 
 	rw_exit(&zp->z_xattr_lock);
 	zfs_exit(zfsvfs, FTAG);
 	if (error == ENOENT)
 		error = SET_ERROR(ENOATTR);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_deleteextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 #endif
 
 static int
 zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname)
 {
 	struct nameidata nd;
 	vnode_t *xvp = NULL, *vp;
 	int error;
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
 	    LOOKUP_XATTR, B_FALSE);
 	if (error != 0)
 		return (error);
 
 #if __FreeBSD_version < 1400043
 	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
 	    UIO_SYSSPACE, attrname, xvp, ap->a_td);
 #else
 	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
 	    UIO_SYSSPACE, attrname, xvp);
 #endif
 	error = namei(&nd);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	vp = nd.ni_vp;
 	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
 	NDFREE_PNBUF(&nd);
 
 	vput(nd.ni_dvp);
 	if (vp == nd.ni_dvp)
 		vrele(vp);
 	else
 		vput(vp);
 
 	return (error);
 }
 
 static int
 zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	nvlist_t *nvl;
 	int error;
 
 	error = zfs_ensure_xattr_cached(zp);
 	if (error != 0)
 		return (error);
 
 	ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
 	ASSERT3P(zp->z_xattr_cached, !=, NULL);
 
 	nvl = zp->z_xattr_cached;
 	error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY);
 	if (error != 0)
 		error = SET_ERROR(error);
 	else
 		error = zfs_sa_set_xattr(zp, attrname, NULL, 0);
 	if (error != 0) {
 		zp->z_xattr_cached = NULL;
 		nvlist_free(nvl);
 	}
 	return (error);
 }
 
 /*
  * Delete one attribute using the name encoding selected by compat.  The SA
  * copy is tried first when the filesystem and the znode both use SA; the
  * xattr directory is tried when SA is not in use or reports ENOENT.
  */
 static int
 zfs_deleteextattr_impl(struct vop_deleteextattr_args *ap, boolean_t compat)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	char attrname[EXTATTR_MAXNAMELEN+1];
 	int error;
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof (attrname), compat);
 	if (error != 0)
 		return (error);
 
 	if (zfsvfs->z_use_sa && zp->z_is_sa) {
 		error = zfs_deleteextattr_sa(ap, attrname);
 		if (error != ENOENT)
 			return (error);
 	}
 	return (zfs_deleteextattr_dir(ap, attrname));
 }
 
 /*
  * Vnode operation to remove a named attribute.
  */
 static int
 zfs_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	int error;
 
 	/*
 	 * If the xattr property is off, refuse the request.
 	 */
 	if (!(zfsvfs->z_flags & ZSB_XATTR))
 		return (SET_ERROR(EOPNOTSUPP));
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	error = zfs_check_attrname(ap->a_name);
 	if (error != 0)
 		return (error);
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 	rw_enter(&zp->z_xattr_lock, RW_WRITER);
 
 	error = zfs_deleteextattr_impl(ap, zfs_xattr_compat);
 	if ((error == ENOENT || error == ENOATTR) &&
 	    ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
 		/*
 		 * Fall back to the alternate namespace format if we failed to
 		 * find a user xattr.
 		 */
 		error = zfs_deleteextattr_impl(ap, !zfs_xattr_compat);
 	}
 
 	rw_exit(&zp->z_xattr_lock);
 	zfs_exit(zfsvfs, FTAG);
 	if (error == ENOENT)
 		error = SET_ERROR(ENOATTR);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_setextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	IN const char *a_name;
 	INOUT struct uio *a_uio;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 #endif
 
 static int
 zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname)
 {
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	struct vattr va;
 	vnode_t *xvp = NULL, *vp;
 	int error, flags;
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
 	    LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
 	if (error != 0)
 		return (error);
 
 	flags = FFLAGS(O_WRONLY | O_CREAT);
 #if __FreeBSD_version < 1400043
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td);
 #else
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp);
 #endif
 	error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
 	    NULL);
 	if (error != 0)
 		return (SET_ERROR(error));
 	vp = nd.ni_vp;
 	NDFREE_PNBUF(&nd);
 
 	VATTR_NULL(&va);
 	va.va_size = 0;
 	error = VOP_SETATTR(vp, &va, ap->a_cred);
 	if (error == 0)
 		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
 
 	VOP_UNLOCK1(vp);
 	vn_close(vp, flags, ap->a_cred, td);
 	return (error);
 }
 
 static int
 zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	nvlist_t *nvl;
 	size_t sa_size;
 	int error;
 
 	error = zfs_ensure_xattr_cached(zp);
 	if (error != 0)
 		return (error);
 
 	ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock));
 	ASSERT3P(zp->z_xattr_cached, !=, NULL);
 
 	nvl = zp->z_xattr_cached;
 	size_t entry_size = ap->a_uio->uio_resid;
 	if (entry_size > DXATTR_MAX_ENTRY_SIZE)
 		return (SET_ERROR(EFBIG));
 	error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR);
 	if (error != 0)
 		return (SET_ERROR(error));
 	if (sa_size > DXATTR_MAX_SA_SIZE)
 		return (SET_ERROR(EFBIG));
 	uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP);
 	error = uiomove(buf, entry_size, ap->a_uio);
 	if (error != 0) {
 		error = SET_ERROR(error);
 	} else {
 		error = nvlist_add_byte_array(nvl, attrname, buf, entry_size);
 		if (error != 0)
 			error = SET_ERROR(error);
 	}
 	if (error == 0)
 		error = zfs_sa_set_xattr(zp, attrname, buf, entry_size);
 	kmem_free(buf, entry_size);
 	if (error != 0) {
 		zp->z_xattr_cached = NULL;
 		nvlist_free(nvl);
 	}
 	return (error);
 }
 
 /*
  * Store one attribute using the name encoding selected by compat.
  *
  * SA storage is attempted when the filesystem uses SA, the znode is an SA
  * znode and z_xattr_sa is set; if that is not the case or it fails, the
  * xattr directory is used instead.  After a successful store, any copy in
  * the other location is removed, and for the user namespace so is any copy
  * under the alternate name encoding.  Those removals are best effort: their
  * results are ignored.
  */
 static int
 zfs_setextattr_impl(struct vop_setextattr_args *ap, boolean_t compat)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	char attrname[EXTATTR_MAXNAMELEN+1];
 	int error;
 
 	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
 	    sizeof (attrname), compat);
 	if (error != 0)
 		return (error);
 
 	struct vop_deleteextattr_args vda = {
 		.a_vp = ap->a_vp,
 		.a_attrnamespace = ap->a_attrnamespace,
 		.a_name = ap->a_name,
 		.a_cred = ap->a_cred,
 		.a_td = ap->a_td,
 	};
 	const int try_sa = (zfsvfs->z_use_sa && zp->z_is_sa &&
 	    zfsvfs->z_xattr_sa);
 
 	error = try_sa ? zfs_setextattr_sa(ap, attrname) : ENOENT;
 	if (error == 0) {
 		/*
 		 * Successfully put into SA, we need to clear the one
 		 * in dir if present.
 		 */
 		zfs_deleteextattr_dir(&vda, attrname);
 	} else {
 		error = zfs_setextattr_dir(ap, attrname);
 		if (error != 0)
 			return (error);
 		/*
 		 * Successfully put into dir, we need to clear the one
 		 * in SA if present.
 		 */
 		if (zp->z_is_sa)
 			zfs_deleteextattr_sa(&vda, attrname);
 	}
 
 	/* Also clear all versions of the alternate compat name. */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_USER)
 		zfs_deleteextattr_impl(&vda, !compat);
 	return (0);
 }
 
 /*
  * Vnode operation to set a named attribute.
  */
 static int
 zfs_setextattr(struct vop_setextattr_args *ap)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	int error;
 
 	/*
 	 * If the xattr property is off, refuse the request.
 	 */
 	if (!(zfsvfs->z_flags & ZSB_XATTR))
 		return (SET_ERROR(EOPNOTSUPP));
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	error = zfs_check_attrname(ap->a_name);
 	if (error != 0)
 		return (error);
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 	rw_enter(&zp->z_xattr_lock, RW_WRITER);
 
 	error = zfs_setextattr_impl(ap, zfs_xattr_compat);
 
 	rw_exit(&zp->z_xattr_lock);
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_listextattr {
 	IN struct vnode *a_vp;
 	IN int a_attrnamespace;
 	INOUT struct uio *a_uio;
 	OUT size_t *a_size;
 	IN struct ucred *a_cred;
 	IN struct thread *a_td;
 };
 #endif
 
 /*
  * List the extended attributes stored as files in the xattr directory whose
  * names start with attrprefix.
  *
  * For every match, either 1 + name length is added to *a_size, or, if
  * a_size is NULL and a_uio is not, a one-byte length followed by the name
  * (with the prefix removed) is copied to a_uio.  A missing xattr directory
  * is not an error: it simply yields no entries.
  */
 static int
 zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix)
 {
 	struct thread *td = ap->a_td;
 	struct nameidata nd;
 	/* Scratch buffer refilled by each VOP_READDIR() call below. */
 	uint8_t dirbuf[sizeof (struct dirent)];
 	struct iovec aiov;
 	struct uio auio;
 	vnode_t *xvp = NULL, *vp;
 	int error, eof;
 
 	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred,
 	    LOOKUP_XATTR, B_FALSE);
 	if (error != 0) {
 		/*
 		 * ENOATTR means that the EA directory does not yet exist,
 		 * i.e. there are no extended attributes there.
 		 */
 		if (error == ENOATTR)
 			error = 0;
 		return (error);
 	}
 
 	/* Look up "." relative to the xattr directory, share-locked. */
 #if __FreeBSD_version < 1400043
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
 	    UIO_SYSSPACE, ".", xvp, td);
 #else
 	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
 	    UIO_SYSSPACE, ".", xvp);
 #endif
 	error = namei(&nd);
 	if (error != 0)
 		return (SET_ERROR(error));
 	vp = nd.ni_vp;
 	NDFREE_PNBUF(&nd);
 
 	/*
 	 * Kernel-space read uio over dirbuf.  The iovec and resid are reset
 	 * on every iteration; uio_offset is set once here and not reset.
 	 */
 	auio.uio_iov = &aiov;
 	auio.uio_iovcnt = 1;
 	auio.uio_segflg = UIO_SYSSPACE;
 	auio.uio_td = td;
 	auio.uio_rw = UIO_READ;
 	auio.uio_offset = 0;
 
 	size_t plen = strlen(attrprefix);
 
 	do {
 		aiov.iov_base = (void *)dirbuf;
 		aiov.iov_len = sizeof (dirbuf);
 		auio.uio_resid = sizeof (dirbuf);
 		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
 		if (error != 0)
 			break;
 		/* Number of bytes VOP_READDIR() placed in dirbuf. */
 		int done = sizeof (dirbuf) - auio.uio_resid;
 		for (int pos = 0; pos < done; ) {
 			struct dirent *dp = (struct dirent *)(dirbuf + pos);
 			/* Advance now so the "continue"s below skip ahead. */
 			pos += dp->d_reclen;
 			/*
 			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
 			 * is what we get when attribute was created on Solaris.
 			 */
 			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
 				continue;
 			/*
 			 * With an empty prefix, skip names that carry a
 			 * namespace prefix; otherwise skip names that do not
 			 * start with the requested prefix.
 			 */
 			else if (plen == 0 &&
 			    ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name))
 				continue;
 			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
 				continue;
 			/* Name length without the prefix, kept in one byte. */
 			uint8_t nlen = dp->d_namlen - plen;
 			if (ap->a_size != NULL) {
 				*ap->a_size += 1 + nlen;
 			} else if (ap->a_uio != NULL) {
 				/*
 				 * Format of extattr name entry is one byte for
 				 * length and the rest for name.
 				 */
 				error = uiomove(&nlen, 1, ap->a_uio);
 				if (error == 0) {
 					char *namep = dp->d_name + plen;
 					error = uiomove(namep, nlen, ap->a_uio);
 				}
 				if (error != 0) {
 					error = SET_ERROR(error);
 					/*
 					 * Leaves the for loop only; the outer
 					 * loop then stops on error != 0.
 					 */
 					break;
 				}
 			}
 		}
 	} while (!eof && error == 0);
 
 	vput(vp);
 	return (error);
 }
 
 static int
 zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	int error;
 
 	error = zfs_ensure_xattr_cached(zp);
 	if (error != 0)
 		return (error);
 
 	ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock));
 	ASSERT3P(zp->z_xattr_cached, !=, NULL);
 
 	size_t plen = strlen(attrprefix);
 	nvpair_t *nvp = NULL;
 	while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) {
 		ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY);
 
 		const char *name = nvpair_name(nvp);
 		if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(name))
 			continue;
 		else if (strncmp(name, attrprefix, plen) != 0)
 			continue;
 		uint8_t nlen = strlen(name) - plen;
 		if (ap->a_size != NULL) {
 			*ap->a_size += 1 + nlen;
 		} else if (ap->a_uio != NULL) {
 			/*
 			 * Format of extattr name entry is one byte for
 			 * length and the rest for name.
 			 */
 			error = uiomove(&nlen, 1, ap->a_uio);
 			if (error == 0) {
 				char *namep = __DECONST(char *, name) + plen;
 				error = uiomove(namep, nlen, ap->a_uio);
 			}
 			if (error != 0) {
 				error = SET_ERROR(error);
 				break;
 			}
 		}
 	}
 
 	return (error);
 }
 
 /*
  * List attributes for one name encoding (selected by compat): first those
  * stored in SA, when the filesystem and the znode both use SA, then those
  * stored in the xattr directory.  The directory pass is skipped if the SA
  * pass fails.
  */
 static int
 zfs_listextattr_impl(struct vop_listextattr_args *ap, boolean_t compat)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	char attrprefix[16];
 	int error;
 
 	/* An empty name yields just the namespace prefix. */
 	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
 	    sizeof (attrprefix), compat);
 	if (error != 0)
 		return (error);
 
 	if (zfsvfs->z_use_sa && zp->z_is_sa) {
 		error = zfs_listextattr_sa(ap, attrprefix);
 		if (error != 0)
 			return (error);
 	}
 	return (zfs_listextattr_dir(ap, attrprefix));
 }
 
 /*
  * Vnode operation to retrieve extended attributes on a vnode.
  */
 static int
 zfs_listextattr(struct vop_listextattr_args *ap)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	int error;
 
 	if (ap->a_size != NULL)
 		*ap->a_size = 0;
 
 	/*
 	 * If the xattr property is off, refuse the request.
 	 */
 	if (!(zfsvfs->z_flags & ZSB_XATTR))
 		return (SET_ERROR(EOPNOTSUPP));
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error != 0)
 		return (SET_ERROR(error));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 	rw_enter(&zp->z_xattr_lock, RW_READER);
 
 	error = zfs_listextattr_impl(ap, zfs_xattr_compat);
 	if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) {
 		/* Also list user xattrs with the alternate format. */
 		error = zfs_listextattr_impl(ap, !zfs_xattr_compat);
 	}
 
 	rw_exit(&zp->z_xattr_lock);
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_getacl_args {
 	struct vnode *vp;
 	acl_type_t type;
 	struct acl *aclp;
 	struct ucred *cred;
 	struct thread *td;
 };
 #endif
 
 static int
 zfs_freebsd_getacl(struct vop_getacl_args *ap)
 {
 	int		error;
 	vsecattr_t	vsecattr;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
 	if ((error = zfs_getsecattr(VTOZ(ap->a_vp),
 	    &vsecattr, 0, ap->a_cred)))
 		return (error);
 
 	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp,
 	    vsecattr.vsa_aclcnt);
 	if (vsecattr.vsa_aclentp != NULL)
 		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_setacl_args {
 	struct vnode *vp;
 	acl_type_t type;
 	struct acl *aclp;
 	struct ucred *cred;
 	struct thread *td;
 };
 #endif
 
 static int
 zfs_freebsd_setacl(struct vop_setacl_args *ap)
 {
 	int		error;
 	vsecattr_t vsecattr;
 	int		aclbsize;	/* size of acl list in bytes */
 	aclent_t	*aaclp;
 
 	if (ap->a_type != ACL_TYPE_NFS4)
 		return (EINVAL);
 
 	if (ap->a_aclp == NULL)
 		return (EINVAL);
 
 	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
 		return (EINVAL);
 
 	/*
 	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
 	 * splitting every entry into two and appending "canonical six"
 	 * entries at the end.  Don't allow for setting an ACL that would
 	 * cause chmod(2) to run out of ACL entries.
 	 */
 	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
 		return (ENOSPC);
 
 	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
 	if (error != 0)
 		return (error);
 
 	vsecattr.vsa_mask = VSA_ACE;
 	aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t);
 	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
 	aaclp = vsecattr.vsa_aclentp;
 	vsecattr.vsa_aclentsz = aclbsize;
 
 	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
 	error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred);
 	kmem_free(aaclp, aclbsize);
 
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct vop_aclcheck_args {
 	struct vnode *vp;
 	acl_type_t type;
 	struct acl *aclp;
 	struct ucred *cred;
 	struct thread *td;
 };
 #endif
 
 /*
  * Aclcheck vnode operation.  Not implemented here: the arguments are
  * ignored and EOPNOTSUPP is returned unconditionally.
  */
 static int
 zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
 {
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Vptocnp vnode operation: translate a vnode into its parent directory
  * vnode (*a_vpp) and its name component, which is copied in front of the
  * data already in a_buf while *a_buflen is reduced by the name's length.
  *
  * Two cases are handled.  In the first, the parent and name are obtained
  * from zfs_znode_parent_and_name().  In the second (the root znode of a
  * filesystem whose z_parent is a different zfsvfs), the request is
  * forwarded to the vnode covered by this mount.
  */
 static int
 zfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	vnode_t *covered_vp;
 	vnode_t *vp = ap->a_vp;
 	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
 	znode_t *zp = VTOZ(vp);
 	int ltype;
 	int error;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	/*
 	 * If we are a snapshot mounted under .zfs, run the operation
 	 * on the covered vnode.  The branch below handles every other case
 	 * (not the root znode, or a filesystem that is its own z_parent) and
 	 * returns; the covered-vnode path follows after it.
 	 */
 	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
 		char name[MAXNAMLEN + 1];
 		znode_t *dzp;
 		size_t len;
 
 		error = zfs_znode_parent_and_name(zp, &dzp, name);
 		if (error == 0) {
 			len = strlen(name);
 			/* The name must fit in the space left in the buffer. */
 			if (*ap->a_buflen < len)
 				error = SET_ERROR(ENOMEM);
 		}
 		if (error == 0) {
 			/* Prepend: the buffer is filled from the end. */
 			*ap->a_buflen -= len;
 			memcpy(ap->a_buf + *ap->a_buflen, name, len);
 			*ap->a_vpp = ZTOV(dzp);
 		}
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	zfs_exit(zfsvfs, FTAG);
 
 	/*
 	 * Take a hold on the covered vnode first, then unlock vp before
 	 * locking the covered vnode; vp is relocked with its original lock
 	 * type at the end.
 	 */
 	covered_vp = vp->v_mount->mnt_vnodecovered;
 #if __FreeBSD_version >= 1300045
 	enum vgetstate vs = vget_prep(covered_vp);
 #else
 	vhold(covered_vp);
 #endif
 	ltype = VOP_ISLOCKED(vp);
 	VOP_UNLOCK1(vp);
 #if __FreeBSD_version >= 1300045
 	error = vget_finish(covered_vp, LK_SHARED, vs);
 #else
 	error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
 #endif
 	if (error == 0) {
 #if __FreeBSD_version >= 1300123
 		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf,
 		    ap->a_buflen);
 #else
 		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
 		    ap->a_buf, ap->a_buflen);
 #endif
 		vput(covered_vp);
 	}
 	vn_lock(vp, ltype | LK_RETRY);
 	/* vp was unlocked for a while; report ENOENT if it is now doomed. */
 	if (VN_IS_DOOMED(vp))
 		error = SET_ERROR(ENOENT);
 	return (error);
 }
 
 #if __FreeBSD_version >= 1400032
 /*
  * Deallocate vnode operation.  Frees the byte range [*a_offset, *a_offset +
  * *a_len), clamped to the current file size, through zfs_freesp().  On
  * success *a_offset is advanced past the freed range and *a_len is set to
  * 0.  A range that is empty after clamping succeeds immediately with
  * *a_len set to 0.  Returns EROFS on a read-only filesystem.
  */
 static int
 zfs_deallocate(struct vop_deallocate_args *ap)
 {
 	znode_t *zp = VTOZ(ap->a_vp);
 	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 	zilog_t *zilog;
 	off_t start, nbytes, file_end;
 	int error;
 
 	error = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Callers might not be able to detect properly that we are read-only,
 	 * so check it explicitly here.
 	 */
 	if (zfs_is_readonly(zfsvfs)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EROFS));
 	}
 
 	zilog = zfsvfs->z_log;
 	start = *ap->a_offset;
 	nbytes = *ap->a_len;
 	file_end = zp->z_size;
 
 	/* Never free past the end of the file. */
 	if (start + nbytes > file_end)
 		nbytes = file_end - start;
 
 	/* Fast path for out-of-range request. */
 	if (nbytes <= 0) {
 		*ap->a_len = 0;
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	error = zfs_freesp(zp, start, nbytes, O_RDWR, TRUE);
 	if (error != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/* Commit the log now when synchronous semantics were asked for. */
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS ||
 	    (ap->a_ioflag & IO_SYNC) != 0)
 		zil_commit(zilog, zp->z_id);
 	*ap->a_offset = start + nbytes;
 	*ap->a_len = 0;
 
 	zfs_exit(zfsvfs, FTAG);
 	return (0);
 }
 #endif
 
 #if __FreeBSD_version >= 1300039
 #ifndef _SYS_SYSPROTO_H_
 struct vop_copy_file_range_args {
 	struct vnode *a_invp;
 	off_t *a_inoffp;
 	struct vnode *a_outvp;
 	off_t *a_outoffp;
 	size_t *a_lenp;
 	unsigned int a_flags;
 	struct ucred *a_incred;
 	struct ucred *a_outcred;
 	struct thread *a_fsizetd;
 }
 #endif
 /*
  * TODO: FreeBSD will only call the file system-specific copy_file_range() if
  * both files reside under the same mountpoint. In the case of ZFS we want to
  * be called even if the files are in different datasets (but in the same
  * pool, which we need to check ourselves).
  *
  * Copy_file_range vnode operation.  Tries to satisfy the request with
  * zfs_clone_range().  Returns ENOSYS whenever cloning is not attempted or
  * zfs_clone_range() reports EXDEV, EAGAIN, EINVAL or EOPNOTSUPP; otherwise
  * returns the zfs_clone_range() result and stores the resulting length in
  * *a_lenp.
  */
 static int
 zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
 {
 	zfsvfs_t *outzfsvfs;
 	struct vnode *invp = ap->a_invp;
 	struct vnode *outvp = ap->a_outvp;
 	struct mount *mp;
-	struct uio io;
 	int error;
 	uint64_t len = *ap->a_lenp;
 
 	/* Cloning disabled: no write has been started, so mp stays NULL. */
 	if (!zfs_bclone_enabled) {
 		mp = NULL;
 		goto bad_write_fallback;
 	}
 
 	/*
 	 * TODO: If offset/length is not aligned to recordsize, use
 	 * vn_generic_copy_file_range() on this fragment.
 	 * It would be better to do this after we lock the vnodes, but then we
 	 * need something else than vn_generic_copy_file_range().
 	 */
 
 	vn_start_write(outvp, &mp, V_WAIT);
 	if (__predict_true(mp == outvp->v_mount)) {
 		outzfsvfs = (zfsvfs_t *)mp->mnt_data;
 		if (!spa_feature_is_enabled(dmu_objset_spa(outzfsvfs->z_os),
 		    SPA_FEATURE_BLOCK_CLONING)) {
 			goto bad_write_fallback;
 		}
 	}
 	/* Same vnode on both sides: lock it once; otherwise lock the pair. */
 	if (invp == outvp) {
 		if (vn_lock(outvp, LK_EXCLUSIVE) != 0) {
 			goto bad_write_fallback;
 		}
 	} else {
 #if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \
 	__FreeBSD_version >= 1400086
 		vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false,
 		    LK_EXCLUSIVE);
 #else
 		vn_lock_pair(invp, false, outvp, false);
 #endif
 		if (VN_IS_DOOMED(invp) || VN_IS_DOOMED(outvp)) {
 			goto bad_locked_fallback;
 		}
 	}
 
 #ifdef MAC
 	error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred,
 	    outvp);
 	if (error != 0)
 		goto out_locked;
 #endif
 
-	io.uio_offset = *ap->a_outoffp;
-	io.uio_resid = *ap->a_lenp;
-	error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd);
-	if (error != 0)
-		goto out_locked;
-
 	error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
 	    ap->a_outoffp, &len, ap->a_outcred);
 	/* These results are reported to the caller as ENOSYS instead. */
 	if (error == EXDEV || error == EAGAIN || error == EINVAL ||
 	    error == EOPNOTSUPP)
 		goto bad_locked_fallback;
 	*ap->a_lenp = (size_t)len;
 out_locked:
 	/* Normal exit: unlock the vnode(s) and end the write. */
 	if (invp != outvp)
 		VOP_UNLOCK(invp);
 	VOP_UNLOCK(outvp);
 	if (mp != NULL)
 		vn_finished_write(mp);
 	return (error);
 
 bad_locked_fallback:
 	/* Fallback with the vnode(s) still locked: unlock them first. */
 	if (invp != outvp)
 		VOP_UNLOCK(invp);
 	VOP_UNLOCK(outvp);
 bad_write_fallback:
 	/* Fallback: end the write if one was started and return ENOSYS. */
 	if (mp != NULL)
 		vn_finished_write(mp);
 	error = ENOSYS;
 	return (error);
 }
 #endif
 
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 struct vop_vector zfs_shareops;
 
 /*
  * Main ZFS vnode operations vector.  .vop_default names default_vnodeops as
  * the fallback vector.  Several entries are compiled in only for
  * sufficiently new __FreeBSD_version values.
  */
 struct vop_vector zfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_inactive =		zfs_freebsd_inactive,
 #if __FreeBSD_version >= 1300042
 	.vop_need_inactive =	zfs_freebsd_need_inactive,
 #endif
 	.vop_reclaim =		zfs_freebsd_reclaim,
 #if __FreeBSD_version >= 1300102
 	.vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
 #endif
 #if __FreeBSD_version >= 1300139
 	.vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
 #endif
 	.vop_access =		zfs_freebsd_access,
 	.vop_allocate =		VOP_EINVAL,
 #if __FreeBSD_version >= 1400032
 	.vop_deallocate =	zfs_deallocate,
 #endif
 	.vop_lookup =		zfs_cache_lookup,
 	.vop_cachedlookup =	zfs_freebsd_cachedlookup,
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_create =		zfs_freebsd_create,
 	/* mknod reuses the create handler through a function-pointer cast. */
 	.vop_mknod =		(vop_mknod_t *)zfs_freebsd_create,
 	.vop_mkdir =		zfs_freebsd_mkdir,
 	.vop_readdir =		zfs_freebsd_readdir,
 	.vop_fsync =		zfs_freebsd_fsync,
 	.vop_open =		zfs_freebsd_open,
 	.vop_close =		zfs_freebsd_close,
 	.vop_rmdir =		zfs_freebsd_rmdir,
 	.vop_ioctl =		zfs_freebsd_ioctl,
 	.vop_link =		zfs_freebsd_link,
 	.vop_symlink =		zfs_freebsd_symlink,
 	.vop_readlink =		zfs_freebsd_readlink,
 	.vop_read =		zfs_freebsd_read,
 	.vop_write =		zfs_freebsd_write,
 	.vop_remove =		zfs_freebsd_remove,
 	.vop_rename =		zfs_freebsd_rename,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 	.vop_bmap =		zfs_freebsd_bmap,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getextattr =	zfs_getextattr,
 	.vop_deleteextattr =	zfs_deleteextattr,
 	.vop_setextattr =	zfs_setextattr,
 	.vop_listextattr =	zfs_listextattr,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 	.vop_getpages =		zfs_freebsd_getpages,
 	.vop_putpages =		zfs_freebsd_putpages,
 	.vop_vptocnp =		zfs_vptocnp,
 #if __FreeBSD_version >= 1300064
 	.vop_lock1 =		vop_lock,
 	.vop_unlock =		vop_unlock,
 	.vop_islocked =		vop_islocked,
 #endif
 #if __FreeBSD_version >= 1400043
 	.vop_add_writecount =	vop_stdadd_writecount_nomsync,
 #endif
 #if __FreeBSD_version >= 1300039
 	.vop_copy_file_range =	zfs_freebsd_copy_file_range,
 #endif
 };
 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
 
 /*
  * Vnode operations vector that names fifo_specops as its fallback vector.
  * Read and write are wired to VOP_PANIC here.
  */
 struct vop_vector zfs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_fsync =		zfs_freebsd_fsync,
 #if __FreeBSD_version >= 1300102
 	.vop_fplookup_vexec = zfs_freebsd_fplookup_vexec,
 #endif
 #if __FreeBSD_version >= 1300139
 	.vop_fplookup_symlink = zfs_freebsd_fplookup_symlink,
 #endif
 	.vop_access =		zfs_freebsd_access,
 	.vop_getattr =		zfs_freebsd_getattr,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_setattr =		zfs_freebsd_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_pathconf = 	zfs_freebsd_pathconf,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_getacl =		zfs_freebsd_getacl,
 	.vop_setacl =		zfs_freebsd_setacl,
 	.vop_aclcheck =		zfs_freebsd_aclcheck,
 #if __FreeBSD_version >= 1400043
 	.vop_add_writecount =	vop_stdadd_writecount_nomsync,
 #endif
 };
 VFS_VOP_VECTOR_REGISTER(zfs_fifoops);
 
 /*
  * Vnode operations template for the special hidden share files.
  * Both fplookup hooks are set to VOP_EAGAIN; everything not listed uses
  * default_vnodeops as the fallback vector.
  */
 struct vop_vector zfs_shareops = {
 	.vop_default =		&default_vnodeops,
 #if __FreeBSD_version >= 1300121
 	.vop_fplookup_vexec =	VOP_EAGAIN,
 #endif
 #if __FreeBSD_version >= 1300139
 	.vop_fplookup_symlink =	VOP_EAGAIN,
 #endif
 	.vop_access =		zfs_freebsd_access,
 	.vop_inactive =		zfs_freebsd_inactive,
 	.vop_reclaim =		zfs_freebsd_reclaim,
 	.vop_fid =		zfs_freebsd_fid,
 	.vop_pathconf =		zfs_freebsd_pathconf,
 #if __FreeBSD_version >= 1400043
 	.vop_add_writecount =	vop_stdadd_writecount_nomsync,
 #endif
 };
 VFS_VOP_VECTOR_REGISTER(zfs_shareops);
 
 ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW,
 	"Use legacy ZFS xattr naming for writing new user namespace xattrs");
diff --git a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
index f0f929d3ce90..2c0cdd9febf5 100644
--- a/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
+++ b/sys/contrib/openzfs/module/os/linux/spl/spl-proc.c
@@ -1,766 +1,807 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  Solaris Porting Layer (SPL) Proc Implementation.
  */
+/*
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
+ */
 
 #include <sys/systeminfo.h>
 #include <sys/kstat.h>
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/vmem.h>
 #include <sys/taskq.h>
 #include <sys/proc.h>
 #include <linux/ctype.h>
 #include <linux/kmod.h>
 #include <linux/seq_file.h>
 #include <linux/uaccess.h>
 #include <linux/version.h>
 #include "zfs_gitrev.h"
 
 #if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0)
 typedef struct ctl_table __no_const spl_ctl_table;
 #else
 typedef struct ctl_table spl_ctl_table;
 #endif
 
+#ifdef HAVE_PROC_HANDLER_CTL_TABLE_CONST
+#define	CONST_CTL_TABLE		const struct ctl_table
+#else
+#define	CONST_CTL_TABLE		struct ctl_table
+#endif
+
 /* Bounds handed to the sysctl helpers through .extra1 / .extra2. */
 static unsigned long table_min = 0;
 static unsigned long table_max = ~0;
 
 /* Sysctl registration handles; released in spl_proc_cleanup(). */
 static struct ctl_table_header *spl_header = NULL;
 #ifndef HAVE_REGISTER_SYSCTL_TABLE
 static struct ctl_table_header *spl_kmem = NULL;
 static struct ctl_table_header *spl_kstat = NULL;
 #endif
 /* /proc entries created by spl_proc_init(). */
 static struct proc_dir_entry *proc_spl = NULL;
 static struct proc_dir_entry *proc_spl_kmem = NULL;
 static struct proc_dir_entry *proc_spl_kmem_slab = NULL;
 static struct proc_dir_entry *proc_spl_taskq_all = NULL;
 static struct proc_dir_entry *proc_spl_taskq = NULL;
 /* Not static: visible outside this file. */
 struct proc_dir_entry *proc_spl_kstat = NULL;
 
 #ifdef DEBUG_KMEM
 /*
  * Sysctl handler that reports the atomic counter referenced by table->data.
  *
  * On read the counter is snapshotted into a local and emitted through
  * proc_doulongvec_minmax() using a stack copy of the table entry.  On write
  * nothing is stored; *ppos is simply advanced by *lenp.
  * Returns the proc_doulongvec_minmax() result on read and 0 on write.
  */
 static int
-proc_domemused(struct ctl_table *table, int write,
+proc_domemused(CONST_CTL_TABLE *table, int write,
     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int rc = 0;
 	unsigned long val;
 	spl_ctl_table dummy = *table;
 
 	/* Point the copied entry at the local snapshot instead of the atomic. */
 	dummy.data = &val;
 	/*
 	 * NOTE(review): proc_handler is set to proc_dointvec although the value
 	 * is emitted via proc_doulongvec_minmax() below -- confirm it is unused.
 	 */
 	dummy.proc_handler = &proc_dointvec;
 	dummy.extra1 = &table_min;
 	dummy.extra2 = &table_max;
 
 	if (write) {
 		*ppos += *lenp;
 	} else {
 #ifdef HAVE_ATOMIC64_T
 		val = atomic64_read((atomic64_t *)table->data);
 #else
 		val = atomic_read((atomic_t *)table->data);
 #endif /* HAVE_ATOMIC64_T */
 		rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
 	}
 
 	return (rc);
 }
 #endif /* DEBUG_KMEM */
 
 /*
  * Sysctl handler that reports an aggregate over spl_kmem_cache_list.
  *
  * table->data is not a pointer here: it carries a bitmask that selects
  * which caches to include (matched against skc_flags) and which quantity
  * to sum (KMC_TOTAL, KMC_ALLOC or KMC_MAX).  On read the sum is computed
  * under the spl_kmem_cache_sem read lock and emitted through
  * proc_doulongvec_minmax().  On write nothing is stored; *ppos is advanced
  * by *lenp.  Returns the helper's result on read and 0 on write.
  */
 static int
-proc_doslab(struct ctl_table *table, int write,
+proc_doslab(CONST_CTL_TABLE *table, int write,
     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int rc = 0;
 	unsigned long val = 0, mask;
 	spl_ctl_table dummy = *table;
 	spl_kmem_cache_t *skc = NULL;
 
 	/* Point the copied entry at the local sum. */
 	dummy.data = &val;
 	dummy.proc_handler = &proc_dointvec;
 	dummy.extra1 = &table_min;
 	dummy.extra2 = &table_max;
 
 	if (write) {
 		*ppos += *lenp;
 	} else {
 		down_read(&spl_kmem_cache_sem);
 		mask = (unsigned long)table->data;
 
 		list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
 
 			/* Only use slabs of the correct kmem/vmem type */
 			if (!(skc->skc_flags & mask))
 				continue;
 
 			/* Sum the specified field for selected slabs */
 			switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) {
 			case KMC_TOTAL:
 				val += skc->skc_slab_size * skc->skc_slab_total;
 				break;
 			case KMC_ALLOC:
 				val += skc->skc_obj_size * skc->skc_obj_alloc;
 				break;
 			case KMC_MAX:
 				val += skc->skc_obj_size * skc->skc_obj_max;
 				break;
 			}
 		}
 
 		up_read(&spl_kmem_cache_sem);
 		rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos);
 	}
 
 	return (rc);
 }
 
 /*
  * Sysctl handler for the hostid entry.
  *
  * On read the current hostid (zone_get_hostid(NULL)) is formatted as hex
  * without a leading "0x" and handed to proc_dostring().  On write the user
  * string is fetched with proc_dostring(), parsed as hex and stored in
  * spl_hostid.
  *
  * Returns 0 on success, or -EINVAL when a write contains no hex digits.
  */
 static int
-proc_dohostid(struct ctl_table *table, int write,
+proc_dohostid(CONST_CTL_TABLE *table, int write,
     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	char *end;
 	/*
 	 * Zero-filled so the buffer is always a valid (empty) string: on the
 	 * write path proc_dostring() may leave it untouched (e.g. a zero
 	 * length write) or inspect its current contents before copying, and
 	 * it is parsed below either way.
 	 */
 	char str[32] = { 0 };
 	unsigned long hid;
 	spl_ctl_table dummy = *table;
 
 	/* Redirect the copied entry at the local string buffer. */
 	dummy.data = str;
 	dummy.maxlen = sizeof (str) - 1;
 
 	if (!write)
 		snprintf(str, sizeof (str), "%lx",
 		    (unsigned long) zone_get_hostid(NULL));
 
 	/* always returns 0 */
 	proc_dostring(&dummy, write, buffer, lenp, ppos);
 
 	if (write) {
 		/*
 		 * We can't use proc_doulongvec_minmax() in the write
 		 * case here because hostid, while a hex value, has no
 		 * leading 0x, which confuses the helper function.
 		 */
 
 		hid = simple_strtoul(str, &end, 16);
 		/* Nothing parsed (including an empty buffer): reject. */
 		if (str == end)
 			return (-EINVAL);
 		spl_hostid = hid;
 	}
 
 	return (0);
 }
 
 /*
  * Print the column heading line for the taskq listings; the widths match
  * the per-taskq summary line printed by taskq_seq_show_impl().
  */
 static void
 taskq_seq_show_headers(struct seq_file *f)
 {
 	seq_printf(f,
 	    "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n",
 	    "taskq",
 	    "act", "nthr", "spwn", "maxt",
 	    "pri", "mina", "maxa", "cura",
 	    "flags");
 }
 
 /* indices into the lheads array below */
 #define	LHEAD_PEND	0
 #define	LHEAD_PRIO	1
 #define	LHEAD_DELAY	2
 #define	LHEAD_WAIT	3
 #define	LHEAD_ACTIVE	4
 #define	LHEAD_SIZE	5
 
 /*
  * Upper bound on the entries printed per pend/prio/delay/wait list in
  * taskq_seq_show_impl(); 0 disables the limit.
  */
 static unsigned int spl_max_show_tasks = 512;
 /* CSTYLED */
 module_param(spl_max_show_tasks, uint, 0644);
 MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc");
 
 /*
  * Emit one taskq's state into the seq_file: a summary line followed by the
  * contents of its active, pending, priority, delay and wait lists.
  *
  *	f	- seq_file being written
  *	p	- the taskq_t to show
  *	allflag	- when B_FALSE, a taskq whose lists are all empty produces
  *		  no output at all
  *
  * tq->tq_lock is held for the whole dump.  The wait queue lock is dropped
  * early when the wait list is empty and otherwise held until the end.
  * Always returns 0.
  */
 static int
 taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag)
 {
 	taskq_t *tq = p;
 	taskq_thread_t *tqt = NULL;
 	spl_wait_queue_entry_t *wq;
 	struct task_struct *tsk;
 	taskq_ent_t *tqe;
 	char name[100];
 	struct list_head *lheads[LHEAD_SIZE], *lh;
 	static char *list_names[LHEAD_SIZE] =
 	    {"pend", "prio", "delay", "wait", "active" };
 	int i, j, have_lheads = 0;
 	unsigned long wflags, flags;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags);
 
 	/* get the various lists and check whether they're empty */
 	lheads[LHEAD_PEND] = &tq->tq_pend_list;
 	lheads[LHEAD_PRIO] = &tq->tq_prio_list;
 	lheads[LHEAD_DELAY] = &tq->tq_delay_list;
 #ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
 	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head;
 #else
 	lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list;
 #endif
 	lheads[LHEAD_ACTIVE] = &tq->tq_active_list;
 
 	/* Empty lists are recorded as NULL so later code can skip them. */
 	for (i = 0; i < LHEAD_SIZE; ++i) {
 		if (list_empty(lheads[i]))
 			lheads[i] = NULL;
 		else
 			++have_lheads;
 	}
 
 	/* early return in non-"all" mode if lists are all empty */
 	if (!allflag && !have_lheads) {
 		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 		return (0);
 	}
 
 	/* unlock the waitq quickly */
 	if (!lheads[LHEAD_WAIT])
 		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
 
 	/* show the base taskq contents */
 	snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance);
 	seq_printf(f, "%-25s ", name);
 	seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n",
 	    tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn,
 	    tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc,
 	    tq->tq_nalloc, tq->tq_flags);
 
 	/* show the active list */
 	if (lheads[LHEAD_ACTIVE]) {
 		j = 0;
 		list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) {
 			/* Two entries per output line. */
 			if (j == 0)
 				seq_printf(f, "\t%s:",
 				    list_names[LHEAD_ACTIVE]);
 			else if (j == 2) {
 				seq_printf(f, "\n\t       ");
 				j = 0;
 			}
 			seq_printf(f, " [%d]%pf(%ps)",
 			    tqt->tqt_thread->pid,
 			    tqt->tqt_task->tqent_func,
 			    tqt->tqt_task->tqent_arg);
 			++j;
 		}
 		seq_printf(f, "\n");
 	}
 
 	/* Relies on LHEAD_PEND..LHEAD_WAIT being consecutive indices. */
 	for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i)
 		if (lheads[i]) {
 			j = 0;
 			list_for_each(lh, lheads[i]) {
 				/* spl_max_show_tasks == 0 means no limit. */
 				if (spl_max_show_tasks != 0 &&
 				    j >= spl_max_show_tasks) {
 					seq_printf(f, "\n\t(truncated)");
 					break;
 				}
 				/* show the wait waitq list */
 				if (i == LHEAD_WAIT) {
 #ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY
 					wq = list_entry(lh,
 					    spl_wait_queue_entry_t, entry);
 #else
 					wq = list_entry(lh,
 					    spl_wait_queue_entry_t, task_list);
 #endif
 					if (j == 0)
 						seq_printf(f, "\t%s:",
 						    list_names[i]);
 					else if (j % 8 == 0)
 						seq_printf(f, "\n\t     ");
 
 					tsk = wq->private;
 					seq_printf(f, " %d", tsk->pid);
 				/* pend, prio and delay lists */
 				} else {
 					tqe = list_entry(lh, taskq_ent_t,
 					    tqent_list);
 					if (j == 0)
 						seq_printf(f, "\t%s:",
 						    list_names[i]);
 					else if (j % 2 == 0)
 						seq_printf(f, "\n\t     ");
 
 					seq_printf(f, " %pf(%ps)",
 					    tqe->tqent_func,
 					    tqe->tqent_arg);
 				}
 				++j;
 			}
 			seq_printf(f, "\n");
 		}
 	/* Still held only if the wait list was non-empty (see above). */
 	if (lheads[LHEAD_WAIT])
 		spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags);
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	return (0);
 }
 
 static int
 taskq_all_seq_show(struct seq_file *f, void *p)
 {
 	return (taskq_seq_show_impl(f, p, B_TRUE));
 }
 
 static int
 taskq_seq_show(struct seq_file *f, void *p)
 {
 	return (taskq_seq_show_impl(f, p, B_FALSE));
 }
 
 /*
  * seq_file ->start callback for the taskq listings.
  *
  * Takes tq_list_sem for reading (dropped again in taskq_seq_stop()), prints
  * the column headers when starting from position 0, and returns the taskq
  * at index *pos in tq_list.
  *
  * Returns NULL when *pos is past the end of the list.  This includes the
  * empty-list case, where the list head itself must not be handed out as if
  * it were embedded in a taskq_t.
  */
 static void *
 taskq_seq_start(struct seq_file *f, loff_t *pos)
 {
 	struct list_head *p;
 	loff_t n = *pos;
 
 	down_read(&tq_list_sem);
 	if (!n)
 		taskq_seq_show_headers(f);
 
 	/* Walk to the n-th node, stopping if we wrap back to the head. */
 	for (p = tq_list.next; p != &tq_list; p = p->next) {
 		if (n-- == 0)
 			return (list_entry(p, taskq_t, tq_taskqs));
 	}
 
 	return (NULL);
 }
 
 /*
  * seq_file ->next callback: advance *pos and return the taskq following p,
  * or NULL once the walk is back at the tq_list head.
  */
 static void *
 taskq_seq_next(struct seq_file *f, void *p, loff_t *pos)
 {
 	taskq_t *cur = p;
 	struct list_head *succ;
 
 	++*pos;
 	succ = cur->tq_taskqs.next;
 	if (succ == &tq_list)
 		return (NULL);
 
 	return (list_entry(succ, taskq_t, tq_taskqs));
 }
 
 /*
  * Print the two heading lines of the slab listing: a row of column group
  * titles (cache, slab, object, emergency) followed by the column names.
  */
 static void
 slab_seq_show_headers(struct seq_file *f)
 {
 	/* Group titles. */
 	seq_printf(f,
 	    "--------------------- cache ----------"
 	    "---------------------------------------------  "
 	    "----- slab ------  "
 	    "---- object -----  "
 	    "--- emergency ---\n");
 	/* Column names. */
 	seq_printf(f,
 	    "name                                  "
 	    "  flags      size     alloc slabsize  objsize  "
 	    "total alloc   max  "
 	    "total alloc   max  "
 	    "dlock alloc   max\n");
 }
 
 /*
  * seq_file ->show callback: print one row of the slab listing for the
  * cache passed in p.  The row is produced with skc_lock held.
  * Always returns 0.
  */
 static int
 slab_seq_show(struct seq_file *f, void *p)
 {
 	spl_kmem_cache_t *skc = p;
 	uint64_t objs_allocated;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 
 	if (!(skc->skc_flags & KMC_SLAB)) {
 		/* Not Linux-slab backed: every column has a real value. */
 		spin_lock(&skc->skc_lock);
 		seq_printf(f, "%-36s  ", skc->skc_name);
 		seq_printf(f, "0x%05lx %9lu %9lu %8u %8u  "
 		    "%5lu %5lu %5lu  %5lu %5lu %5lu  %5lu %5lu %5lu\n",
 		    (long unsigned)skc->skc_flags,
 		    (long unsigned)(skc->skc_slab_size * skc->skc_slab_total),
 		    (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc),
 		    (unsigned)skc->skc_slab_size,
 		    (unsigned)skc->skc_obj_size,
 		    (long unsigned)skc->skc_slab_total,
 		    (long unsigned)skc->skc_slab_alloc,
 		    (long unsigned)skc->skc_slab_max,
 		    (long unsigned)skc->skc_obj_total,
 		    (long unsigned)skc->skc_obj_alloc,
 		    (long unsigned)skc->skc_obj_max,
 		    (long unsigned)skc->skc_obj_deadlock,
 		    (long unsigned)skc->skc_obj_emergency,
 		    (long unsigned)skc->skc_obj_emergency_max);
 		spin_unlock(&skc->skc_lock);
 		return (0);
 	}
 
 	/*
 	 * This cache is backed by a generic Linux kmem cache which has its
 	 * own accounting.  Only the number of active allocated objects is
 	 * tracked here; the remaining columns are shown as "-".  See
 	 * /proc/slabinfo for the statistics of the underlying Linux cache.
 	 */
 	spin_lock(&skc->skc_lock);
 	objs_allocated = percpu_counter_sum(&skc->skc_linux_alloc);
 	seq_printf(f, "%-36s  ", skc->skc_name);
 	seq_printf(f, "0x%05lx %9s %9lu %8s %8u  "
 	    "%5s %5s %5s  %5s %5lu %5s  %5s %5s %5s\n",
 	    (long unsigned)skc->skc_flags,
 	    "-",
 	    (long unsigned)(skc->skc_obj_size * objs_allocated),
 	    "-",
 	    (unsigned)skc->skc_obj_size,
 	    "-", "-", "-", "-",
 	    (long unsigned)objs_allocated,
 	    "-", "-", "-", "-");
 	spin_unlock(&skc->skc_lock);
 
 	return (0);
 }
 
 /*
  * seq_file ->start callback for the slab listing.
  *
  * Takes spl_kmem_cache_sem for reading (dropped again in slab_seq_stop()),
  * prints the column headers when starting from position 0, and returns the
  * cache at index *pos in spl_kmem_cache_list.
  *
  * Returns NULL when *pos is past the end of the list.  This includes the
  * empty-list case, where the list head itself must not be handed out as if
  * it were embedded in a spl_kmem_cache_t.
  */
 static void *
 slab_seq_start(struct seq_file *f, loff_t *pos)
 {
 	struct list_head *p;
 	loff_t n = *pos;
 
 	down_read(&spl_kmem_cache_sem);
 	if (!n)
 		slab_seq_show_headers(f);
 
 	/* Walk to the n-th node, stopping if we wrap back to the head. */
 	for (p = spl_kmem_cache_list.next; p != &spl_kmem_cache_list;
 	    p = p->next) {
 		if (n-- == 0)
 			return (list_entry(p, spl_kmem_cache_t, skc_list));
 	}
 
 	return (NULL);
 }
 
 /*
  * seq_file ->next callback: advance *pos and return the cache following p,
  * or NULL once the walk is back at the spl_kmem_cache_list head.
  */
 static void *
 slab_seq_next(struct seq_file *f, void *p, loff_t *pos)
 {
 	spl_kmem_cache_t *cur = p;
 	struct list_head *succ;
 
 	++*pos;
 	succ = cur->skc_list.next;
 	if (succ == &spl_kmem_cache_list)
 		return (NULL);
 
 	return (list_entry(succ, spl_kmem_cache_t, skc_list));
 }
 
 /* seq_file ->stop callback: drop the read lock taken in slab_seq_start(). */
 static void
 slab_seq_stop(struct seq_file *f, void *v)
 {
 	up_read(&spl_kmem_cache_sem);
 }
 
 /* Iterator callbacks for the slab listing; used by proc_slab_open(). */
 static const struct seq_operations slab_seq_ops = {
 	.show  = slab_seq_show,
 	.start = slab_seq_start,
 	.next  = slab_seq_next,
 	.stop  = slab_seq_stop,
 };
 
 /* Open handler for the "slab" proc file: attach the slab seq iterator. */
 static int
 proc_slab_open(struct inode *inode, struct file *filp)
 {
 	int rc;
 
 	rc = seq_open(filp, &slab_seq_ops);
 	return (rc);
 }
 
 /*
  * File operations for the "slab" proc file.  The member names depend on
  * HAVE_PROC_OPS_STRUCT; both variants wire up the same four handlers.
  */
 static const kstat_proc_op_t proc_slab_operations = {
 #ifdef HAVE_PROC_OPS_STRUCT
 	.proc_open	= proc_slab_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
 	.proc_release	= seq_release,
 #else
 	.open		= proc_slab_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 #endif
 };
 
 /* seq_file ->stop callback: drop the read lock taken in taskq_seq_start(). */
 static void
 taskq_seq_stop(struct seq_file *f, void *v)
 {
 	up_read(&tq_list_sem);
 }
 
 /* Iterator for "taskq-all"; differs from taskq_seq_ops only in .show. */
 static const struct seq_operations taskq_all_seq_ops = {
 	.show	= taskq_all_seq_show,
 	.start	= taskq_seq_start,
 	.next	= taskq_seq_next,
 	.stop	= taskq_seq_stop,
 };
 
 /* Iterator for "taskq"; differs from taskq_all_seq_ops only in .show. */
 static const struct seq_operations taskq_seq_ops = {
 	.show	= taskq_seq_show,
 	.start	= taskq_seq_start,
 	.next	= taskq_seq_next,
 	.stop	= taskq_seq_stop,
 };
 
 /* Open handler for "taskq-all": attach the show-everything iterator. */
 static int
 proc_taskq_all_open(struct inode *inode, struct file *filp)
 {
 	int rc;
 
 	rc = seq_open(filp, &taskq_all_seq_ops);
 	return (rc);
 }
 
 /* Open handler for "taskq": attach the iterator that skips idle taskqs. */
 static int
 proc_taskq_open(struct inode *inode, struct file *filp)
 {
 	int rc;
 
 	rc = seq_open(filp, &taskq_seq_ops);
 	return (rc);
 }
 
 /*
  * File operations for the "taskq-all" proc file.  The member names depend
  * on HAVE_PROC_OPS_STRUCT; both variants wire up the same handlers.
  */
 static const kstat_proc_op_t proc_taskq_all_operations = {
 #ifdef HAVE_PROC_OPS_STRUCT
 	.proc_open	= proc_taskq_all_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
 	.proc_release	= seq_release,
 #else
 	.open		= proc_taskq_all_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 #endif
 };
 
 /* File operations for the "taskq" proc file; same layout as above. */
 static const kstat_proc_op_t proc_taskq_operations = {
 #ifdef HAVE_PROC_OPS_STRUCT
 	.proc_open	= proc_taskq_open,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
 	.proc_release	= seq_release,
 #else
 	.open		= proc_taskq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 #endif
 };
 
 /*
  * Sysctl entries registered under "kernel/spl/kmem"; all are mode 0444.
  * For the slab_* entries .data is not a pointer: it carries the flag mask
  * that proc_doslab() uses to select caches and the quantity to sum.
  * The trailing empty element is the table terminator.
  */
 static struct ctl_table spl_kmem_table[] = {
 #ifdef DEBUG_KMEM
 	{
 		.procname	= "kmem_used",
 		.data		= &kmem_alloc_used,
 #ifdef HAVE_ATOMIC64_T
 		.maxlen		= sizeof (atomic64_t),
 #else
 		.maxlen		= sizeof (atomic_t),
 #endif /* HAVE_ATOMIC64_T */
 		.mode		= 0444,
 		.proc_handler	= &proc_domemused,
 	},
 	{
 		.procname	= "kmem_max",
 		.data		= &kmem_alloc_max,
 		.maxlen		= sizeof (unsigned long),
 		.extra1		= &table_min,
 		.extra2		= &table_max,
 		.mode		= 0444,
 		.proc_handler	= &proc_doulongvec_minmax,
 	},
 #endif /* DEBUG_KMEM */
 	{
 		.procname	= "slab_kvmem_total",
 		.data		= (void *)(KMC_KVMEM | KMC_TOTAL),
 		.maxlen		= sizeof (unsigned long),
 		.extra1		= &table_min,
 		.extra2		= &table_max,
 		.mode		= 0444,
 		.proc_handler	= &proc_doslab,
 	},
 	{
 		.procname	= "slab_kvmem_alloc",
 		.data		= (void *)(KMC_KVMEM | KMC_ALLOC),
 		.maxlen		= sizeof (unsigned long),
 		.extra1		= &table_min,
 		.extra2		= &table_max,
 		.mode		= 0444,
 		.proc_handler	= &proc_doslab,
 	},
 	{
 		.procname	= "slab_kvmem_max",
 		.data		= (void *)(KMC_KVMEM | KMC_MAX),
 		.maxlen		= sizeof (unsigned long),
 		.extra1		= &table_min,
 		.extra2		= &table_max,
 		.mode		= 0444,
 		.proc_handler	= &proc_doslab,
 	},
 	{},
 };
 
 /* Sysctl table for "kernel/spl/kstat": only the terminator, no entries. */
 static struct ctl_table spl_kstat_table[] = {
 	{},
 };
 
 /*
  * Sysctl entries registered under "kernel/spl".  When
  * HAVE_REGISTER_SYSCTL_TABLE is defined the kmem and kstat tables are
  * attached here as child directories; otherwise spl_proc_init() registers
  * them separately.  The trailing empty element is the table terminator.
  */
 static struct ctl_table spl_table[] = {
 	/*
 	 * NB No .strategy entries have been provided since
 	 * sysctl(8) prefers to go via /proc for portability.
 	 */
 	{
 		.procname	= "gitrev",
 		.data		= (char *)ZFS_META_GITREV,
 		.maxlen		= sizeof (ZFS_META_GITREV),
 		.mode		= 0444,
 		.proc_handler	= &proc_dostring,
 	},
 	{
 		.procname	= "hostid",
 		.data		= &spl_hostid,
 		.maxlen		= sizeof (unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &proc_dohostid,
 	},
 #ifdef HAVE_REGISTER_SYSCTL_TABLE
 	{
 		.procname	= "kmem",
 		.mode		= 0555,
 		.child		= spl_kmem_table,
 	},
 	{
 		.procname	= "kstat",
 		.mode		= 0555,
 		.child		= spl_kstat_table,
 	},
 #endif
 	{},
 };
 
 #ifdef HAVE_REGISTER_SYSCTL_TABLE
 static struct ctl_table spl_dir[] = {
 	{
 		.procname	= "spl",
 		.mode		= 0555,
 		.child		= spl_table,
 	},
 	{}
 };
 
 static struct ctl_table spl_root[] = {
 	{
 		.procname	= "kernel",
 		.mode		= 0555,
 		.child		= spl_dir,
 	},
 	{}
 };
 #endif
 
 static void spl_proc_cleanup(void)
 {
 	remove_proc_entry("kstat", proc_spl);
 	remove_proc_entry("slab", proc_spl_kmem);
 	remove_proc_entry("kmem", proc_spl);
 	remove_proc_entry("taskq-all", proc_spl);
 	remove_proc_entry("taskq", proc_spl);
 	remove_proc_entry("spl", NULL);
 
 #ifndef HAVE_REGISTER_SYSCTL_TABLE
 	if (spl_kstat) {
 		unregister_sysctl_table(spl_kstat);
 		spl_kstat = NULL;
 	}
 	if (spl_kmem) {
 		unregister_sysctl_table(spl_kmem);
 		spl_kmem = NULL;
 	}
 #endif
 	if (spl_header) {
 		unregister_sysctl_table(spl_header);
 		spl_header = NULL;
 	}
 }
 
+#ifndef HAVE_REGISTER_SYSCTL_TABLE
+
+/*
+ * Traditionally, struct ctl_table arrays have been terminated by an "empty"
+ * sentinel element (specifically, one with .procname == NULL).
+ *
+ * Linux 6.6 began migrating away from this, adding register_sysctl_sz() so
+ * that callers could provide the size directly, and redefining
+ * register_sysctl() to just call register_sysctl_sz() with the array size. It
+ * retained support for the terminating element so that existing callers would
+ * continue to work.
+ *
+ * Linux 6.11 removed support for the terminating element, instead interpreting
+ * it as a real malformed element, and rejecting it.
+ *
+ * In order to continue to support older kernels, we retain the terminating
+ * sentinel element for our sysctl tables, but instead detect availability of
+ * register_sysctl_sz(). If it exists, we pass it the array size -1, stopping
+ * the kernel from trying to process the terminator. For pre-6.6 kernels that
+ * don't have register_sysctl_sz(), we just use register_sysctl(), which can
+ * handle the terminating element as it always has.
+ */
+#ifdef HAVE_REGISTER_SYSCTL_SZ
+#define	spl_proc_register_sysctl(p, t)	\
+	register_sysctl_sz(p, t, ARRAY_SIZE(t)-1)
+#else
+#define	spl_proc_register_sysctl(p, t)	\
+	register_sysctl(p, t)
+#endif
+#endif
+
 /*
  * Register the SPL sysctl tables and create the SPL /proc entries:
  * "spl", and beneath it "taskq-all", "taskq", "kmem" (containing "slab")
  * and "kstat".
  *
  * Returns 0 on success or -EUNATCH if any registration or creation fails.
  * Failure to register the first sysctl table returns immediately; every
  * later failure goes through spl_proc_cleanup() before returning.
  */
 int
 spl_proc_init(void)
 {
 	int rc = 0;
 
 #ifdef HAVE_REGISTER_SYSCTL_TABLE
 	/* One registration covers the whole kernel/spl tree. */
 	spl_header = register_sysctl_table(spl_root);
 	if (spl_header == NULL)
 		return (-EUNATCH);
 #else
 	/* Each directory is registered separately by path. */
-	spl_header = register_sysctl("kernel/spl", spl_table);
+	spl_header = spl_proc_register_sysctl("kernel/spl", spl_table);
 	if (spl_header == NULL)
 		return (-EUNATCH);
 
-	spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table);
+	spl_kmem = spl_proc_register_sysctl("kernel/spl/kmem", spl_kmem_table);
 	if (spl_kmem == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
-	spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table);
+	spl_kstat = spl_proc_register_sysctl("kernel/spl/kstat",
+	    spl_kstat_table);
 	if (spl_kstat == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
 #endif
 
 	/* NULL parent: "spl" is created directly under /proc. */
 	proc_spl = proc_mkdir("spl", NULL);
 	if (proc_spl == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
 
 	proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl,
 	    &proc_taskq_all_operations, NULL);
 	if (proc_spl_taskq_all == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
 
 	proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl,
 	    &proc_taskq_operations, NULL);
 	if (proc_spl_taskq == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
 
 	proc_spl_kmem = proc_mkdir("kmem", proc_spl);
 	if (proc_spl_kmem == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
 
 	proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem,
 	    &proc_slab_operations, NULL);
 	if (proc_spl_kmem_slab == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
 
 	proc_spl_kstat = proc_mkdir("kstat", proc_spl);
 	if (proc_spl_kstat == NULL) {
 		rc = -EUNATCH;
 		goto out;
 	}
 out:
 	/* The success path also reaches here, with rc == 0. */
 	if (rc)
 		spl_proc_cleanup();
 
 	return (rc);
 }
 
 /* Counterpart of spl_proc_init(): simply runs spl_proc_cleanup(). */
 void
 spl_proc_fini(void)
 {
 	spl_proc_cleanup();
 }
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
index be528f6e8176..fb871ed8cef6 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zfs_vnops_os.c
@@ -1,4245 +1,4246 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
 /* Portions Copyright 2010 Robert Milkowski */
 
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/sysmacros.h>
 #include <sys/vfs.h>
 #include <sys/file.h>
 #include <sys/stat.h>
 #include <sys/kmem.h>
 #include <sys/taskq.h>
 #include <sys/uio.h>
 #include <sys/vmsystm.h>
 #include <sys/atomic.h>
 #include <sys/pathname.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/policy.h>
 #include <sys/sunddi.h>
 #include <sys/sid.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/zfs_quota.h>
 #include <sys/zfs_sa.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_rlock.h>
 #include <sys/cred.h>
 #include <sys/zpl.h>
 #include <sys/zil.h>
 #include <sys/sa_impl.h>
+#include <linux/mm_compat.h>
 
 /*
  * Programming rules.
  *
  * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  * properly lock its in-core state, create a DMU transaction, do the work,
  * record this work in the intent log (ZIL), commit the DMU transaction,
  * and wait for the intent log to commit if it is a synchronous operation.
  * Moreover, the vnode ops must work in both normal and log replay context.
  * The ordering of events is important to avoid deadlocks and references
  * to freed memory.  The example below illustrates the following Big Rules:
  *
  *  (1) A check must be made in each zfs thread for a mounted file system.
  *	This is done avoiding races using zfs_enter(zfsvfs).
  *      A zfs_exit(zfsvfs) is needed before all returns.  Any znodes
  *      must be checked with zfs_verify_zp(zp).  Both of these macros
  *      can return EIO from the calling function.
  *
  *  (2) zrele() should always be the last thing except for zil_commit() (if
  *	necessary) and zfs_exit(). This is for 3 reasons: First, if it's the
  *	last reference, the vnode/znode can be freed, so the zp may point to
  *	freed memory.  Second, the last reference will call zfs_zinactive(),
  *	which may induce a lot of work -- pushing cached pages (which acquires
  *	range locks) and syncing out cached atime changes.  Third,
  *	zfs_zinactive() may require a new tx, which could deadlock the system
  *	if you were already holding one. This deadlock occurs because the tx
  *	currently being operated on prevents a txg from syncing, which
  *	prevents the new tx from progressing, resulting in a deadlock.  If you
  *	must call zrele() within a tx, use zfs_zrele_async(). Note that iput()
  *	is a synonym for zrele().
  *
  *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
  *	as they can span dmu_tx_assign() calls.
  *
  *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
  *      dmu_tx_assign().  This is critical because we don't want to block
  *      while holding locks.
  *
  *	If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT.  This
  *	reduces lock contention and CPU usage when we must wait (note that if
  *	throughput is constrained by the storage, nearly every transaction
  *	must wait).
  *
  *      Note, in particular, that if a lock is sometimes acquired before
  *      the tx assigns, and sometimes after (e.g. z_lock), then failing
  *      to use a non-blocking assign can deadlock the system.  The scenario:
  *
  *	Thread A has grabbed a lock before calling dmu_tx_assign().
  *	Thread B is in an already-assigned tx, and blocks for this lock.
  *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
  *	forever, because the previous txg can't quiesce until B's tx commits.
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
  *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  *	to indicate that this operation has already called dmu_tx_wait().
  *	This will ensure that we don't retry forever, waiting a short bit
  *	each time.
  *
  *  (5)	If the operation succeeded, generate the intent log entry for it
  *	before dropping locks.  This ensures that the ordering of events
  *	in the intent log matches the order in which they actually occurred.
  *	During ZIL replay the zfs_log_* functions will update the sequence
  *	number to indicate the zil transaction has replayed.
  *
  *  (6)	At the end of each vnode op, the DMU tx must always commit,
  *	regardless of whether there were any errors.
  *
  *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
  *	to ensure that synchronous semantics are provided when necessary.
  *
  * In general, this is how things should be ordered in each vnode op:
  *
  *	zfs_enter(zfsvfs);		// exit if unmounted
  * top:
  *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may igrab())
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
  *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
  *		zrele(...);		// release held znodes
  *		if (error == ERESTART) {
  *			waited = B_TRUE;
  *			dmu_tx_wait(tx);
  *			dmu_tx_abort(tx);
  *			goto top;
  *		}
  *		dmu_tx_abort(tx);	// abort DMU tx
  *		zfs_exit(zfsvfs);	// finished in zfs
  *		return (error);		// really out of space
  *	}
  *	error = do_real_work();		// do whatever this VOP does
  *	if (error == 0)
  *		zfs_log_*(...);		// on success, make ZIL entry
  *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
  *	rw_exit(...);			// drop locks
  *	zfs_dirent_unlock(dl);		// unlock directory entry
  *	zrele(...);			// release held znodes
  *	zil_commit(zilog, foid);	// synchronous when necessary
  *	zfs_exit(zfsvfs);		// finished in zfs
  *	return (error);			// done, report error
  */
 int
 zfs_open(struct inode *ip, int mode, int flag, cred_t *cr)
 {
 	(void) cr;
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int error;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	/* Honor ZFS_APPENDONLY file attribute */
 	if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) &&
 	    ((flag & O_APPEND) == 0)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	/* Keep a count of the synchronous opens in the znode */
 	if (flag & O_SYNC)
 		atomic_inc_32(&zp->z_sync_cnt);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (0);
 }
 
 int
 zfs_close(struct inode *ip, int flag, cred_t *cr)
 {
 	(void) cr;
 	znode_t	*zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int error;
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	/* Decrement the synchronous opens in the znode */
 	if (flag & O_SYNC)
 		atomic_dec_32(&zp->z_sync_cnt);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (0);
 }
 
 #if defined(_KERNEL)
 
 static int zfs_fillpage(struct inode *ip, struct page *pp);
 
 /*
  * When a file is memory mapped, we must keep the IO data synchronized
  * between the DMU cache and the memory mapped pages.  Update all mapped
  * pages with the contents of the coresponding dmu buffer.
  */
 void
 update_pages(znode_t *zp, int64_t start, int len, objset_t *os)
 {
 	struct address_space *mp = ZTOI(zp)->i_mapping;
 	int64_t off = start & (PAGE_SIZE - 1);
 
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 		uint64_t nbytes = MIN(PAGE_SIZE - off, len);
 
 		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);
 
 			void *pb = kmap(pp);
 			int error = dmu_read(os, zp->z_id, start + off,
 			    nbytes, pb + off, DMU_READ_PREFETCH);
 			kunmap(pp);
 
 			if (error) {
 				SetPageError(pp);
 				ClearPageUptodate(pp);
 			} else {
 				ClearPageError(pp);
 				SetPageUptodate(pp);
 
 				if (mapping_writably_mapped(mp))
 					flush_dcache_page(pp);
 
 				mark_page_accessed(pp);
 			}
 
 			unlock_page(pp);
 			put_page(pp);
 		}
 
 		len -= nbytes;
 		off = 0;
 	}
 }
 
 /*
  * When a file is memory mapped, we must keep the I/O data synchronized
  * between the DMU cache and the memory mapped pages.  Preferentially read
  * from memory mapped pages, otherwise fallback to reading through the dmu.
  */
 int
 mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio)
 {
 	struct inode *ip = ZTOI(zp);
 	struct address_space *mp = ip->i_mapping;
 	int64_t start = uio->uio_loffset;
 	int64_t off = start & (PAGE_SIZE - 1);
 	int len = nbytes;
 	int error = 0;
 
 	for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) {
 		uint64_t bytes = MIN(PAGE_SIZE - off, len);
 
 		struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT);
 		if (pp) {
 			/*
 			 * If filemap_fault() retries there exists a window
 			 * where the page will be unlocked and not up to date.
 			 * In this case we must try and fill the page.
 			 */
 			if (unlikely(!PageUptodate(pp))) {
 				error = zfs_fillpage(ip, pp);
 				if (error) {
 					unlock_page(pp);
 					put_page(pp);
 					return (error);
 				}
 			}
 
 			ASSERT(PageUptodate(pp) || PageDirty(pp));
 
 			unlock_page(pp);
 
 			void *pb = kmap(pp);
 			error = zfs_uiomove(pb + off, bytes, UIO_READ, uio);
 			kunmap(pp);
 
 			if (mapping_writably_mapped(mp))
 				flush_dcache_page(pp);
 
 			mark_page_accessed(pp);
 			put_page(pp);
 		} else {
 			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 			    uio, bytes);
 		}
 
 		len -= bytes;
 		off = 0;
 
 		if (error)
 			break;
 	}
 
 	return (error);
 }
 #endif /* _KERNEL */
 
 static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT;
 
 /*
  * Write the bytes to a file.
  *
  *	IN:	zp	- znode of file to be written to
  *		data	- bytes to write
  *		len	- number of bytes to write
  *		pos	- offset to start writing at
  *
  *	OUT:	resid	- remaining bytes to write
  *
  *	RETURN:	0 if success
  *		positive error code if failure.  EIO is	returned
  *		for a short write when residp isn't provided.
  *
  * Timestamps:
  *	zp - ctime|mtime updated if byte count > 0
  */
 int
 zfs_write_simple(znode_t *zp, const void *data, size_t len,
     loff_t pos, size_t *residp)
 {
 	fstrans_cookie_t cookie;
 	int error;
 
 	struct iovec iov;
 	iov.iov_base = (void *)data;
 	iov.iov_len = len;
 
 	zfs_uio_t uio;
 	zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0);
 
 	cookie = spl_fstrans_mark();
 	error = zfs_write(zp, &uio, 0, kcred);
 	spl_fstrans_unmark(cookie);
 
 	if (error == 0) {
 		if (residp != NULL)
 			*residp = zfs_uio_resid(&uio);
 		else if (zfs_uio_resid(&uio) != 0)
 			error = SET_ERROR(EIO);
 	}
 
 	return (error);
 }
 
 static void
 zfs_rele_async_task(void *arg)
 {
 	iput(arg);
 }
 
 void
 zfs_zrele_async(znode_t *zp)
 {
 	struct inode *ip = ZTOI(zp);
 	objset_t *os = ITOZSB(ip)->z_os;
 
 	ASSERT(atomic_read(&ip->i_count) > 0);
 	ASSERT(os != NULL);
 
 	/*
 	 * If decrementing the count would put us at 0, we can't do it inline
 	 * here, because that would be synchronous. Instead, dispatch an iput
 	 * to run later.
 	 *
 	 * For more information on the dangers of a synchronous iput, see the
 	 * header comment of this file.
 	 */
 	if (!atomic_add_unless(&ip->i_count, -1, 1)) {
 		VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)),
 		    zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID);
 	}
 }
 
 
 /*
  * Lookup an entry in a directory, or an extended attribute directory.
  * If it exists, return a held inode reference for it.
  *
  *	IN:	zdp	- znode of directory to search.
  *		nm	- name of entry to lookup.
  *		flags	- LOOKUP_XATTR set if looking for an attribute.
  *		cr	- credentials of caller.
  *		direntflags - directory lookup flags
  *		realpnp - returned pathname.
  *
  *	OUT:	zpp	- znode of located entry, NULL if not found.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	NA
  */
 int
 zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr,
     int *direntflags, pathname_t *realpnp)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zdp);
 	int error = 0;
 
 	/*
 	 * Fast path lookup, however we must skip DNLC lookup
 	 * for case folding or normalizing lookups because the
 	 * DNLC code only stores the passed in name.  This means
 	 * creating 'a' and removing 'A' on a case insensitive
 	 * file system would work, but DNLC still thinks 'a'
 	 * exists and won't let you create it again on the next
 	 * pass through fast path.
 	 */
 	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
 
 		if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 			return (SET_ERROR(ENOTDIR));
 		} else if (zdp->z_sa_hdl == NULL) {
 			return (SET_ERROR(EIO));
 		}
 
 		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
 			error = zfs_fastaccesschk_execute(zdp, cr);
 			if (!error) {
 				*zpp = zdp;
 				zhold(*zpp);
 				return (0);
 			}
 			return (error);
 		}
 	}
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0)
 		return (error);
 
 	*zpp = NULL;
 
 	if (flags & LOOKUP_XATTR) {
 		/*
 		 * We don't allow recursive attributes..
 		 * Maybe someday we will.
 		 */
 		if (zdp->z_pflags & ZFS_XATTR) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(EINVAL));
 		}
 
 		if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 
 		/*
 		 * Do we have permission to get into attribute directory?
 		 */
 
 		if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0,
 		    B_TRUE, cr, zfs_init_idmap))) {
 			zrele(*zpp);
 			*zpp = NULL;
 		}
 
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if (!S_ISDIR(ZTOI(zdp)->i_mode)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENOTDIR));
 	}
 
 	/*
 	 * Check accessibility of directory.
 	 */
 
 	if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr,
 	    zfs_init_idmap))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp);
 	if ((error == 0) && (*zpp))
 		zfs_znode_update_vfs(*zpp);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Attempt to create a new entry in a directory.  If the entry
  * already exists, truncate the file if permissible, else return
  * an error.  Return the ip of the created or trunc'd file.
  *
  *	IN:	dzp	- znode of directory to put new file entry in.
  *		name	- name of new file entry.
  *		vap	- attributes of new file.
  *		excl	- flag indicating exclusive or non-exclusive mode.
  *		mode	- mode to open file with.
  *		cr	- credentials of caller.
  *		flag	- file flag.
  *		vsecp	- ACL to be set
  *		mnt_ns	- user namespace of the mount
  *
  *	OUT:	zpp	- znode of created or trunc'd entry.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dzp - ctime|mtime updated if new entry created
  *	 zp - ctime|mtime always, atime if new
  */
 int
 zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl,
     int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp,
     zidmap_t *mnt_ns)
 {
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zilog_t		*zilog;
 	objset_t	*os;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	uid_t		uid;
 	gid_t		gid;
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	boolean_t	have_acl = B_FALSE;
 	boolean_t	waited = B_FALSE;
 	boolean_t	skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	gid = crgetgid(cr);
 	uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	os = zfsvfs->z_os;
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 top:
 	*zpp = NULL;
 	if (*name == '\0') {
 		/*
 		 * Null component name refers to the directory itself.
 		 */
 		zhold(dzp);
 		zp = dzp;
 		dl = NULL;
 		error = 0;
 	} else {
 		/* possible igrab(zp) */
 		int zflg = 0;
 
 		if (flag & FIGNORECASE)
 			zflg |= ZCILOOK;
 
 		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 		    NULL, NULL);
 		if (error) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			if (strcmp(name, "..") == 0)
 				error = SET_ERROR(EISDIR);
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	if (zp == NULL) {
 		uint64_t txtype;
 		uint64_t projid = ZFS_DEFAULT_PROJID;
 
 		/*
 		 * Create a new file object and update the directory
 		 * to reference it.
 		 */
 		if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr,
 		    mnt_ns))) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			goto out;
 		}
 
 		/*
 		 * We only support the creation of regular files in
 		 * extended attribute directories.
 		 */
 
 		if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) {
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			error = SET_ERROR(EINVAL);
 			goto out;
 		}
 
 		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 		    cr, vsecp, &acl_ids, mnt_ns)) != 0)
 			goto out;
 		have_acl = B_TRUE;
 
 		if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 			projid = zfs_inherit_projid(dzp);
 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 			zfs_acl_ids_free(&acl_ids);
 			error = SET_ERROR(EDQUOT);
 			goto out;
 		}
 
 		tx = dmu_tx_create(os);
 
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE);
 
 		fuid_dirtied = zfsvfs->z_fuid_dirty;
 		if (fuid_dirtied)
 			zfs_fuid_txhold(zfsvfs, tx);
 		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 		if (!zfsvfs->z_use_sa &&
 		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
 
 		error = dmu_tx_assign(tx,
 		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 		if (error) {
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART) {
 				waited = B_TRUE;
 				dmu_tx_wait(tx);
 				dmu_tx_abort(tx);
 				goto top;
 			}
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_abort(tx);
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 		error = zfs_link_create(dl, zp, tx, ZNEW);
 		if (error != 0) {
 			/*
 			 * Since, we failed to add the directory entry for it,
 			 * delete the newly created dnode.
 			 */
 			zfs_znode_delete(zp, tx);
 			remove_inode_hash(ZTOI(zp));
 			zfs_acl_ids_free(&acl_ids);
 			dmu_tx_commit(tx);
 			goto out;
 		}
 
 		if (fuid_dirtied)
 			zfs_fuid_sync(zfsvfs, tx);
 
 		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
 		if (flag & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
 		    vsecp, acl_ids.z_fuidp, vap);
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_commit(tx);
 	} else {
 		int aflags = (flag & O_APPEND) ? V_APPEND : 0;
 
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 
 		/*
 		 * A directory entry already exists for this name.
 		 */
 		/*
 		 * Can't truncate an existing file if in exclusive mode.
 		 */
 		if (excl) {
 			error = SET_ERROR(EEXIST);
 			goto out;
 		}
 		/*
 		 * Can't open a directory for writing.
 		 */
 		if (S_ISDIR(ZTOI(zp)->i_mode)) {
 			error = SET_ERROR(EISDIR);
 			goto out;
 		}
 		/*
 		 * Verify requested access to file.
 		 */
 		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr,
 		    mnt_ns))) {
 			goto out;
 		}
 
 		mutex_enter(&dzp->z_lock);
 		dzp->z_seq++;
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * Truncate regular files if requested.
 		 */
 		if (S_ISREG(ZTOI(zp)->i_mode) &&
 		    (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) {
 			/* we can't hold any locks when calling zfs_freesp() */
 			if (dl) {
 				zfs_dirent_unlock(dl);
 				dl = NULL;
 			}
 			error = zfs_freesp(zp, 0, 0, mode, TRUE);
 		}
 	}
 out:
 
 	if (dl)
 		zfs_dirent_unlock(dl);
 
 	if (error) {
 		if (zp)
 			zrele(zp);
 	} else {
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(zp);
 		*zpp = zp;
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 int
 zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl,
     int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp,
     zidmap_t *mnt_ns)
 {
 	(void) excl, (void) mode, (void) flag;
 	znode_t		*zp = NULL, *dzp = ITOZ(dip);
 	zfsvfs_t	*zfsvfs = ITOZSB(dip);
 	objset_t	*os;
 	dmu_tx_t	*tx;
 	int		error;
 	uid_t		uid;
 	gid_t		gid;
 	zfs_acl_ids_t   acl_ids;
 	uint64_t	projid = ZFS_DEFAULT_PROJID;
 	boolean_t	fuid_dirtied;
 	boolean_t	have_acl = B_FALSE;
 	boolean_t	waited = B_FALSE;
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	gid = crgetgid(cr);
 	uid = crgetuid(cr);
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	os = zfsvfs->z_os;
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 top:
 	*ipp = NULL;
 
 	/*
 	 * Create a new file object and update the directory
 	 * to reference it.
 	 */
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		goto out;
 	}
 
 	if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
 	    cr, vsecp, &acl_ids, mnt_ns)) != 0)
 		goto out;
 	have_acl = B_TRUE;
 
 	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
 		projid = zfs_inherit_projid(dzp);
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
 		zfs_acl_ids_free(&acl_ids);
 		error = SET_ERROR(EDQUOT);
 		goto out;
 	}
 
 	tx = dmu_tx_create(os);
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	if (!zfsvfs->z_use_sa &&
 	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 		    0, acl_ids.z_aclp->z_acl_bytes);
 	}
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/* Add to unlinked set */
 	zp->z_unlinked = B_TRUE;
 	zfs_unlinked_add(zp, tx);
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 out:
 
 	if (error) {
 		if (zp)
 			zrele(zp);
 	} else {
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(zp);
 		*ipp = ZTOI(zp);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Remove an entry from a directory.
  *
  *	IN:	dzp	- znode of directory to remove entry from.
  *		name	- name of entry to remove.
  *		cr	- credentials of caller.
  *		flags	- case flags.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	dzp - ctime|mtime
  *	 ip - ctime (if nlink > 0)
  */
 
 static uint64_t null_xattr = 0;
 
 int
 zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags)
 {
 	znode_t		*zp;
 	znode_t		*xzp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zilog_t		*zilog;
 	uint64_t	acl_obj, xattr_obj;
 	uint64_t	xattr_obj_unlinked = 0;
 	uint64_t	obj = 0;
 	uint64_t	links;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	boolean_t	may_delete_now, delete_now = FALSE;
 	boolean_t	unlinked, toobig = FALSE;
 	uint64_t	txtype;
 	pathname_t	*realnmp = NULL;
 	pathname_t	realnm;
 	int		error;
 	int		zflg = ZEXISTS;
 	boolean_t	waited = B_FALSE;
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	if (flags & FIGNORECASE) {
 		zflg |= ZCILOOK;
 		pn_alloc(&realnm);
 		realnmp = &realnm;
 	}
 
 top:
 	xattr_obj = 0;
 	xzp = NULL;
 	/*
 	 * Attempt to lock directory; fail if entry doesn't exist.
 	 */
 	if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
 	    NULL, realnmp))) {
 		if (realnmp)
 			pn_free(realnmp);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) {
 		goto out;
 	}
 
 	/*
 	 * Need to use rmdir for removing directories.
 	 */
 	if (S_ISDIR(ZTOI(zp)->i_mode)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	mutex_enter(&zp->z_lock);
 	may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 &&
 	    !zn_has_cached_data(zp, 0, LLONG_MAX);
 	mutex_exit(&zp->z_lock);
 
 	/*
 	 * We may delete the znode now, or we may put it in the unlinked set;
 	 * it depends on whether we're the last link, and on whether there are
 	 * other holds on the inode.  So we dmu_tx_hold() the right things to
 	 * allow for either case.
 	 */
 	obj = zp->z_id;
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	if (may_delete_now) {
 		toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks;
 		/* if the file is too big, only hold_free a token amount */
 		dmu_tx_hold_free(tx, zp->z_id, 0,
 		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
 	}
 
 	/* are there any extended attributes? */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		ASSERT0(error);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 
 	mutex_enter(&zp->z_lock);
 	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 	mutex_exit(&zp->z_lock);
 
 	/* charge as an update -- would be nice not to charge at all */
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	/*
 	 * Mark this transaction as typically resulting in a net free of space
 	 */
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			zrele(zp);
 			if (xzp)
 				zrele(xzp);
 			goto top;
 		}
 		if (realnmp)
 			pn_free(realnmp);
 		dmu_tx_abort(tx);
 		zrele(zp);
 		if (xzp)
 			zrele(xzp);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Remove the directory entry.
 	 */
 	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		goto out;
 	}
 
 	if (unlinked) {
 		/*
 		 * Hold z_lock so that we can make sure that the ACL obj
 		 * hasn't changed.  Could have been deleted due to
 		 * zfs_sa_upgrade().
 		 */
 		mutex_enter(&zp->z_lock);
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
 		delete_now = may_delete_now && !toobig &&
 		    atomic_read(&ZTOI(zp)->i_count) == 1 &&
 		    !zn_has_cached_data(zp, 0, LLONG_MAX) &&
 		    xattr_obj == xattr_obj_unlinked &&
 		    zfs_external_acl(zp) == acl_obj;
 		VERIFY_IMPLY(xattr_obj_unlinked, xzp);
 	}
 
 	if (delete_now) {
 		if (xattr_obj_unlinked) {
 			ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2);
 			mutex_enter(&xzp->z_lock);
 			xzp->z_unlinked = B_TRUE;
 			clear_nlink(ZTOI(xzp));
 			links = 0;
 			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 			    &links, sizeof (links), tx);
 			ASSERT3U(error,  ==,  0);
 			mutex_exit(&xzp->z_lock);
 			zfs_unlinked_add(xzp, tx);
 
 			if (zp->z_is_sa)
 				error = sa_remove(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), tx);
 			else
 				error = sa_update(zp->z_sa_hdl,
 				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
 				    sizeof (uint64_t), tx);
 			ASSERT0(error);
 		}
 		/*
 		 * Add to the unlinked set because a new reference could be
 		 * taken concurrently resulting in a deferred destruction.
 		 */
 		zfs_unlinked_add(zp, tx);
 		mutex_exit(&zp->z_lock);
 	} else if (unlinked) {
 		mutex_exit(&zp->z_lock);
 		zfs_unlinked_add(zp, tx);
 	}
 
 	txtype = TX_REMOVE;
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked);
 
 	dmu_tx_commit(tx);
 out:
 	if (realnmp)
 		pn_free(realnmp);
 
 	zfs_dirent_unlock(dl);
 	zfs_znode_update_vfs(dzp);
 	zfs_znode_update_vfs(zp);
 
 	if (delete_now)
 		zrele(zp);
 	else
 		zfs_zrele_async(zp);
 
 	if (xzp) {
 		zfs_znode_update_vfs(xzp);
 		zfs_zrele_async(xzp);
 	}
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Create a new directory and insert it into dzp using the name
  * provided.  Return a pointer to the inserted directory.
  *
  *	IN:	dzp	- znode of directory to add subdir to.
  *		dirname	- name of new directory.
  *		vap	- attributes of new directory.
  *		cr	- credentials of caller.
  *		flags	- case flags.
  *		vsecp	- ACL to be set
  *		mnt_ns	- user namespace of the mount
  *
  *	OUT:	zpp	- znode of created directory.  Set to NULL once the
  *			  directory entry lock is attempted and only set to
  *			  the new znode on success; the early validation
  *			  failures return without touching it.
  *
  *	RETURN:	0 if success
  *		error code if failure, including:
  *		EINVAL	- dirname is NULL, dzp is an xattr directory, or an
  *			  ACL / ephemeral id was supplied while the file
  *			  system does not use FUIDs.
  *		EILSEQ	- the file system is UTF-8 only and dirname does not
  *			  validate.
  *		EDQUOT	- the ids chosen for the new directory are over quota.
  *
  * Timestamps:
  *	dzp - ctime|mtime updated
  *	zpp - ctime|mtime|atime updated
  */
 int
 zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp,
     cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns)
 {
 	znode_t		*zp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	uint64_t	txtype;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zf = ZNEW;
 	uid_t		uid;
 	gid_t		gid = crgetgid(cr);
 	zfs_acl_ids_t   acl_ids;
 	boolean_t	fuid_dirtied;
 	/* Becomes B_TRUE after the first dmu_tx_wait(), see the retry below. */
 	boolean_t	waited = B_FALSE;
 
 	ASSERT(S_ISDIR(vap->va_mode));
 
 	/*
 	 * If we have an ephemeral id, ACL, or XVATTR then
 	 * make sure file system is at proper version
 	 */
 
 	uid = crgetuid(cr);
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
 		return (SET_ERROR(EINVAL));
 
 	if (dirname == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* From here on every return path must be paired with zfs_exit(). */
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	/* Subdirectories cannot be created inside an xattr directory. */
 	if (dzp->z_pflags & ZFS_XATTR) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * The value u8_validate() stores through &error is not used; any
 	 * validation failure is reported as EILSEQ.
 	 */
 	if (zfsvfs->z_utf8 && u8_validate(dirname,
 	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	if (vap->va_mask & ATTR_XVATTR) {
 		if ((error = secpolicy_xvattr((xvattr_t *)vap,
 		    crgetuid(cr), cr, vap->va_mode)) != 0) {
 			zfs_exit(zfsvfs, FTAG);
 			return (error);
 		}
 	}
 
 	/*
 	 * acl_ids is built once, before the "top" label, and is reused by
 	 * every retry.  Each non-retry exit below must free it.
 	 */
 	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
 	    vsecp, &acl_ids, mnt_ns)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	/*
 	 * First make sure the new directory doesn't exist.
 	 *
 	 * Existence is checked first to make sure we don't return
 	 * EACCES instead of EEXIST which can cause some applications
 	 * to fail.
 	 */
 top:
 	*zpp = NULL;
 
 	if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
 	    NULL, NULL))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr,
 	    mnt_ns))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/*
 	 * Add a new entry to the directory.
 	 */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	/* Extra write hold only for non-SA file systems with a large ACL. */
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
 	/*
 	 * Non-blocking assign.  Once we have waited at least once, the
 	 * retry additionally passes TXG_NOTHROTTLE.
 	 */
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			/*
 			 * Wait with the directory entry lock dropped, then
 			 * start over from the lookup.  acl_ids is kept for
 			 * the retry.
 			 */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Create new node.
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	/*
 	 * Now put new name in parent dir.
 	 */
 	error = zfs_link_create(dl, zp, tx, ZNEW);
 	if (error != 0) {
 		/*
 		 * Undo the node creation inside the same tx; the tx is
 		 * still committed at "out" and the reference on zp is
 		 * dropped there because error is non-zero.
 		 */
 		zfs_znode_delete(zp, tx);
 		remove_inode_hash(ZTOI(zp));
 		goto out;
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	*zpp = zp;
 
 	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
 	if (flags & FIGNORECASE)
 		txtype |= TX_CI;
 	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
 	    acl_ids.z_fuidp, vap);
 
 	/* Common exit for success and for a failed zfs_link_create(). */
 out:
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	if (error != 0) {
 		zrele(zp);
 	} else {
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(zp);
 	}
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Unlink the subdirectory called "name" from the directory dzp.  The
  * request is refused when the target is the caller's current working
  * directory.
  *
  *	IN:	dzp	- znode of the directory that holds the entry.
  *		name	- name of the subdirectory to remove.
  *		cwd	- znode of the current working directory.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *
  *	RETURN:	0 on success, error code on failure.
  *		EINVAL	- name is NULL, or the target is cwd.
  *		ENOTDIR	- the entry exists but is not a directory.
  *
  * Timestamps:
  *	dzp - ctime|mtime updated
  */
 int
 zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr,
     int flags)
 {
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zfs_dirlock_t	*dl;
 	zilog_t		*zilog;
 	znode_t		*zp;
 	dmu_tx_t	*tx;
 	boolean_t	waited = B_FALSE;
 	int		lookup_flags = ZEXISTS;
 	int		error;
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG);
 	if (error != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	if (flags & FIGNORECASE)
 		lookup_flags |= ZCILOOK;
 top:
 	zp = NULL;
 
 	/*
 	 * Lock the directory entry; this fails when it does not exist.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &zp, lookup_flags,
 	    NULL, NULL);
 	if (error != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap);
 	if (error != 0)
 		goto out;
 
 	if (!S_ISDIR(ZTOI(zp)->i_mode)) {
 		error = SET_ERROR(ENOTDIR);
 		goto out;
 	}
 
 	if (zp == cwd) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * Take the name lock so nobody adds to (or looks up in) the
 	 * directory while it is going away, then the parent lock so we
 	 * cooperate with the treewalk and directory rename code.
 	 */
 	rw_enter(&zp->z_name_lock, RW_WRITER);
 	rw_enter(&zp->z_parent_lock, RW_WRITER);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
 
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error != 0) {
 		boolean_t restart = (error == ERESTART) ? B_TRUE : B_FALSE;
 
 		/*
 		 * Drop every lock before (possibly) waiting; a restart
 		 * re-acquires them from the "top" label.
 		 */
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
 		zfs_dirent_unlock(dl);
 		if (restart) {
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 		}
 		dmu_tx_abort(tx);
 		zrele(zp);
 		if (restart)
 			goto top;
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	error = zfs_link_destroy(dl, zp, tx, lookup_flags, NULL);
 	if (error == 0) {
 		uint64_t txtype = (flags & FIGNORECASE) ?
 		    (TX_RMDIR | TX_CI) : TX_RMDIR;
 
 		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT,
 		    B_FALSE);
 	}
 
 	dmu_tx_commit(tx);
 
 	rw_exit(&zp->z_parent_lock);
 	rw_exit(&zp->z_name_lock);
 
 	/* Shared exit: the dirent lock and the hold on zp are still ours. */
 out:
 	zfs_dirent_unlock(dl);
 
 	zfs_znode_update_vfs(dzp);
 	zfs_znode_update_vfs(zp);
 	zrele(zp);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Read directory entries from the given directory cursor position and emit
  * name and position for each entry.
  *
  *	IN:	ip	- inode of directory to read.
  *		ctx	- directory entry context; ctx->pos is read as the
  *			  starting position and advanced after each entry
  *			  that was emitted successfully.
  *		cr	- credentials of caller (unused).
  *
  *	RETURN:	0 if success (including reaching the end of the directory,
  *		or the directory having been unlinked)
  *		error code if failure
  *
  * Timestamps:
  *	ip - atime updated
  *
  * Note that the low 4 bits of the cookie returned by zap are always zero.
  * This allows us to use the low range for "special" directory entries:
  * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
  * we use the offset 2 for the '.zfs' directory.
  */
 int
 zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr)
 {
 	(void) cr;
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	objset_t	*os;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	int		error;
 	uint8_t		prefetch;
 	uint8_t		type;
 	int		done = 0;
 	uint64_t	parent;
 	uint64_t	offset; /* must be unsigned; checks for < 1 */
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (error);
 
 	/* The parent object number is what gets reported for "..". */
 	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent))) != 0)
 		goto out;
 
 	/*
 	 * Quit if directory has been removed (posix)
 	 *
 	 * error is 0 here, so the caller sees success with no entries.
 	 * The cursor has not been initialized yet, hence "out" rather
 	 * than "update".
 	 */
 	if (zp->z_unlinked)
 		goto out;
 
 	error = 0;
 	os = zfsvfs->z_os;
 	offset = ctx->pos;
 	/* Snapshot the flag once; it is cleared after the loop. */
 	prefetch = zp->z_zn_prefetch;
 
 	/*
 	 * Initialize the iterator cursor.
 	 *
 	 * Offsets 0-2 are the special entries handled in the loop and 3 is
 	 * the value reached by incrementing past offset 2, so all of them
 	 * start the ZAP walk from the first entry.
 	 */
 	if (offset <= 3) {
 		/*
 		 * Start iteration from the beginning of the directory.
 		 */
 		zap_cursor_init(&zc, os, zp->z_id);
 	} else {
 		/*
 		 * The offset is a serialized cursor.
 		 */
 		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 	}
 
 	/*
 	 * Transform to file-system independent format
 	 */
 	while (!done) {
 		uint64_t objnum;
 		/*
 		 * Special case `.', `..', and `.zfs'.
 		 */
 		if (offset == 0) {
 			(void) strcpy(zap.za_name, ".");
 			zap.za_normalization_conflict = 0;
 			objnum = zp->z_id;
 			type = DT_DIR;
 		} else if (offset == 1) {
 			(void) strcpy(zap.za_name, "..");
 			zap.za_normalization_conflict = 0;
 			objnum = parent;
 			type = DT_DIR;
 		} else if (offset == 2 && zfs_show_ctldir(zp)) {
 			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
 			zap.za_normalization_conflict = 0;
 			objnum = ZFSCTL_INO_ROOT;
 			type = DT_DIR;
 		} else {
 			/*
 			 * Grab next entry.
 			 *
 			 * ENOENT ends the loop normally (and is turned into
 			 * 0 at "update"); any other error skips the prefetch
 			 * flag reset below.
 			 */
 			if ((error = zap_cursor_retrieve(&zc, &zap))) {
 				if (error == ENOENT)
 					break;
 				else
 					goto update;
 			}
 
 			/*
 			 * Allow multiple entries provided the first entry is
 			 * the object id.  Non-zpl consumers may safely make
 			 * use of the additional space.
 			 *
 			 * XXX: This should be a feature flag for compatibility
 			 */
 			if (zap.za_integer_length != 8 ||
 			    zap.za_num_integers == 0) {
 				cmn_err(CE_WARN, "zap_readdir: bad directory "
 				    "entry, obj = %lld, offset = %lld, "
 				    "length = %d, num = %lld\n",
 				    (u_longlong_t)zp->z_id,
 				    (u_longlong_t)offset,
 				    zap.za_integer_length,
 				    (u_longlong_t)zap.za_num_integers);
 				error = SET_ERROR(ENXIO);
 				goto update;
 			}
 
 			/* The ZAP value packs both the object and its type. */
 			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
 			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
 		}
 
 		/*
 		 * A zero return from zpl_dir_emit() stops the walk.  We
 		 * break before ctx->pos is advanced, so the position still
 		 * refers to the entry that was not accepted.
 		 */
 		done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name),
 		    objnum, type);
 		if (done)
 			break;
 
 		if (prefetch)
 			dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ);
 
 		/*
 		 * Move to the next entry, fill in the previous offset.
 		 *
 		 * The first condition matches exactly the iterations that
 		 * took the ZAP branch above; the special entries simply
 		 * count 0 -> 1 -> 2 -> 3.
 		 */
 		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
 			zap_cursor_advance(&zc);
 			offset = zap_cursor_serialize(&zc);
 		} else {
 			offset += 1;
 		}
 		ctx->pos = offset;
 	}
 	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
 
 	/* Reached by fall-through and by errors raised inside the loop. */
 update:
 	zap_cursor_fini(&zc);
 	if (error == ENOENT)
 		error = 0;
 	/* Reached directly only before the cursor was initialized. */
 out:
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Fill in the caller's kstat structure with the basic attributes of a
  * file.  Most values come straight from the inode via
  * zpl_generic_fillattr(); the block size and block count are then
  * replaced with the values reported by sa_object_size() for the znode.
  *
  *	IN:	user_ns	- idmap handed through to zpl_generic_fillattr().
  *		request_mask - only present when
  *			  HAVE_GENERIC_FILLATTR_IDMAP_REQMASK is defined;
  *			  handed through to zpl_generic_fillattr().
  *		ip	- inode of file.
  *
  *	OUT:	sp	- kstat values.
  *
  *	RETURN:	0 on success
  *		error code from zfs_enter_verify_zp() if the file system
  *		or znode cannot be entered
  */
 int
 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip,
     struct kstat *sp)
 #else
 zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp)
 #endif
 {
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t *zp = ITOZ(ip);
 	u_longlong_t sa_blocks;
 	uint32_t sa_blksize;
 	int rc;
 
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	mutex_enter(&zp->z_lock);
 
 #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK
 	zpl_generic_fillattr(user_ns, request_mask, ip, sp);
 #else
 	zpl_generic_fillattr(user_ns, ip, sp);
 #endif
 
 	/*
 	 * The root inode gains one extra link while the '.zfs' directory
 	 * is visible, as long as that does not exceed ZFS_LINK_MAX.
 	 */
 	if ((zp->z_id == zfsvfs->z_root) && (zfs_show_ctldir(zp)) &&
 	    (sp->nlink < ZFS_LINK_MAX)) {
 		sp->nlink++;
 	}
 
 	sa_object_size(zp->z_sa_hdl, &sa_blksize, &sa_blocks);
 	sp->blocks = sa_blocks;
 
 	/*
 	 * When the znode has no block size yet, advertise the largest
 	 * block size of the file system to suggest maximal I/O transfers.
 	 */
 	if (unlikely(zp->z_blksz == 0))
 		sp->blksize = zfsvfs->z_max_blksz;
 	else
 		sp->blksize = sa_blksize;
 
 	mutex_exit(&zp->z_lock);
 
 	/*
 	 * For the root dentry of a snapshot, report a synthetic inode
 	 * number so that an NFS client does not see the number change
 	 * across a snapshot mount.
 	 */
 	if (zfsvfs->z_issnap && ip->i_sb->s_root->d_inode == ip) {
 		sp->ino = ZFSCTL_INO_SNAPDIRS -
 		    dmu_objset_id(zfsvfs->z_os);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (0);
 }
 
 /*
  * For the operation of changing a file's user/group/project, we need to
  * handle not only the main object that is assigned to the file directly,
  * but also the ones that are used by the file via the hidden xattr
  * directory.
  *
  * Because the xattr directory may contain many EA entries, it may be
  * impossible to change all of them within the transaction that changes
  * the main object's user/group/project attributes.  We therefore change
  * them one by one, each in its own independent transaction.  This may not
  * be a good solution, but we have no better idea yet.
  *
  *	IN:	dzp	- znode of the directory whose entries are walked;
  *			  its uid, gid and project ID are copied to every
  *			  entry that differs.
  *
  *	RETURN:	0 if every entry was processed (running off the end of the
  *		directory is reported by the cursor as ENOENT and mapped to 0)
  *		error code if failure; entries already updated stay updated.
  */
 static int
 zfs_setattr_dir(znode_t *dzp)
 {
 	struct inode	*dxip = ZTOI(dzp);
 	struct inode	*xip = NULL;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	objset_t	*os = zfsvfs->z_os;
 	zap_cursor_t	zc;
 	zap_attribute_t	zap;
 	zfs_dirlock_t	*dl;
 	znode_t		*zp = NULL;
 	dmu_tx_t	*tx = NULL;
 	uint64_t	uid, gid;
 	/* At most four attributes are queued: UID, GID, FLAGS and PROJID. */
 	sa_bulk_attr_t	bulk[4];
 	int		count;
 	int		err;
 
 	zap_cursor_init(&zc, os, dzp->z_id);
 	while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) {
 		count = 0;
 		/* Each entry must be exactly one 8-byte integer. */
 		if (zap.za_integer_length != 8 || zap.za_num_integers != 1) {
 			err = ENXIO;
 			break;
 		}
 
 		err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp,
 		    ZEXISTS, NULL, NULL);
 		/*
 		 * ENOENT just skips this entry.
 		 * NOTE(review): presumably the entry vanished between the
 		 * cursor read and the lock, and zp is left NULL on failure
 		 * so "next" has nothing to release -- confirm against
 		 * zfs_dirent_lock().
 		 */
 		if (err == ENOENT)
 			goto next;
 		if (err)
 			break;
 
 		/* Nothing to do when owner, group and project already match. */
 		xip = ZTOI(zp);
 		if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) &&
 		    KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) &&
 		    zp->z_projid == dzp->z_projid)
 			goto next;
 
 		/*
 		 * One transaction per entry.  The last argument of the SA
 		 * hold is B_TRUE only when the entry does not carry the
 		 * ZFS_PROJID flag yet.
 		 */
 		tx = dmu_tx_create(os);
 		if (!(zp->z_pflags & ZFS_PROJID))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 
 		/* On failure tx, zp and dl are released after the loop. */
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err)
 			break;
 
 		/* The directory's own z_lock is held while its ids are read. */
 		mutex_enter(&dzp->z_lock);
 
 		if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) {
 			xip->i_uid = dxip->i_uid;
 			uid = zfs_uid_read(dxip);
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &uid, sizeof (uid));
 		}
 
 		if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) {
 			xip->i_gid = dxip->i_gid;
 			gid = zfs_gid_read(dxip);
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
 			    &gid, sizeof (gid));
 		}
 
 		if (zp->z_projid != dzp->z_projid) {
 			/* First project ID on this entry: also set the flag. */
 			if (!(zp->z_pflags & ZFS_PROJID)) {
 				zp->z_pflags |= ZFS_PROJID;
 				SA_ADD_BULK_ATTR(bulk, count,
 				    SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags,
 				    sizeof (zp->z_pflags));
 			}
 
 			zp->z_projid = dzp->z_projid;
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs),
 			    NULL, &zp->z_projid, sizeof (zp->z_projid));
 		}
 
 		mutex_exit(&dzp->z_lock);
 
 		/*
 		 * count can only be 0 if the entry stopped differing after
 		 * the unlocked comparison above; then the tx is dropped.
 		 */
 		if (likely(count > 0)) {
 			err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 			dmu_tx_commit(tx);
 		} else {
 			dmu_tx_abort(tx);
 		}
 		/* The tx is finished; keep the post-loop abort from firing. */
 		tx = NULL;
 		/* ENOENT from the update is tolerated and the walk goes on. */
 		if (err != 0 && err != ENOENT)
 			break;
 
 		/* Per-entry cleanup, shared by the skip paths above. */
 next:
 		if (zp) {
 			zrele(zp);
 			zp = NULL;
 			zfs_dirent_unlock(dl);
 		}
 		zap_cursor_advance(&zc);
 	}
 
 	/*
 	 * Cleanup for the "break" paths, which leave the loop without
 	 * passing through "next".
 	 */
 	if (tx)
 		dmu_tx_abort(tx);
 	if (zp) {
 		zrele(zp);
 		zfs_dirent_unlock(dl);
 	}
 	zap_cursor_fini(&zc);
 
 	return (err == ENOENT ? 0 : err);
 }
 
 /*
  * Set the file attributes to the values contained in the
  * vattr structure.
  *
  *	IN:	zp	- znode of file to be modified.
  *		vap	- new attribute values.
  *			  If ATTR_XVATTR set, then optional attrs are being set
  *		flags	- ATTR_UTIME set if non-default time values provided.
  *			- ATTR_NOACLCHECK (CIFS context only).
  *		cr	- credentials of caller.
  *		mnt_ns	- user namespace of the mount
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - ctime updated, mtime updated if size changed.
  */
 int
 zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns)
 {
 	struct inode	*ip;
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	objset_t	*os;
 	zilog_t		*zilog;
 	dmu_tx_t	*tx;
 	vattr_t		oldva;
 	xvattr_t	*tmpxvattr;
 	uint_t		mask = vap->va_mask;
 	uint_t		saved_mask = 0;
 	int		trim_mask = 0;
 	uint64_t	new_mode;
 	uint64_t	new_kuid = 0, new_kgid = 0, new_uid, new_gid;
 	uint64_t	xattr_obj;
 	uint64_t	mtime[2], ctime[2], atime[2];
 	uint64_t	projid = ZFS_INVALID_PROJID;
 	znode_t		*attrzp;
 	int		need_policy = FALSE;
 	int		err, err2 = 0;
 	zfs_fuid_info_t *fuidp = NULL;
 	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
 	xoptattr_t	*xoap;
 	zfs_acl_t	*aclp;
 	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
 	boolean_t	fuid_dirtied = B_FALSE;
 	boolean_t	handle_eadir = B_FALSE;
 	sa_bulk_attr_t	*bulk, *xattr_bulk;
 	int		count = 0, xattr_count = 0, bulks = 8;
 
 	if (mask == 0)
 		return (0);
 
 	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (err);
 	ip = ZTOI(zp);
 	os = zfsvfs->z_os;
 
 	/*
 	 * If this is a xvattr_t, then get a pointer to the structure of
 	 * optional attributes.  If this is NULL, then we have a vattr_t.
 	 */
 	xoap = xva_getxoptattr(xvap);
 	if (xoap != NULL && (mask & ATTR_XVATTR)) {
 		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
 			if (!dmu_objset_projectquota_enabled(os) ||
 			    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) {
 				zfs_exit(zfsvfs, FTAG);
 				return (SET_ERROR(ENOTSUP));
 			}
 
 			projid = xoap->xoa_projid;
 			if (unlikely(projid == ZFS_INVALID_PROJID)) {
 				zfs_exit(zfsvfs, FTAG);
 				return (SET_ERROR(EINVAL));
 			}
 
 			if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID)
 				projid = ZFS_INVALID_PROJID;
 			else
 				need_policy = TRUE;
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) &&
 		    (xoap->xoa_projinherit !=
 		    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) &&
 		    (!dmu_objset_projectquota_enabled(os) ||
 		    (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) {
 			zfs_exit(zfsvfs, FTAG);
 			return (SET_ERROR(ENOTSUP));
 		}
 	}
 
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * Make sure that if we have ephemeral uid/gid or xvattr specified
 	 * that file system is at proper version level
 	 */
 
 	if (zfsvfs->z_use_fuids == B_FALSE &&
 	    (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) ||
 	    ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) ||
 	    (mask & ATTR_XVATTR))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EISDIR));
 	}
 
 	if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP);
 	xva_init(tmpxvattr);
 
 	bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
 	xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP);
 
 	/*
 	 * Immutable files can only alter immutable bit and atime
 	 */
 	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
 	    ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) ||
 	    ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
 		err = SET_ERROR(EPERM);
 		goto out3;
 	}
 
 	if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
 		err = SET_ERROR(EPERM);
 		goto out3;
 	}
 
 	/*
 	 * Verify timestamps doesn't overflow 32 bits.
 	 * ZFS can handle large timestamps, but 32bit syscalls can't
 	 * handle times greater than 2039.  This check should be removed
 	 * once large timestamps are fully supported.
 	 */
 	if (mask & (ATTR_ATIME | ATTR_MTIME)) {
 		if (((mask & ATTR_ATIME) &&
 		    TIMESPEC_OVERFLOW(&vap->va_atime)) ||
 		    ((mask & ATTR_MTIME) &&
 		    TIMESPEC_OVERFLOW(&vap->va_mtime))) {
 			err = SET_ERROR(EOVERFLOW);
 			goto out3;
 		}
 	}
 
 top:
 	attrzp = NULL;
 	aclp = NULL;
 
 	/* Can this be moved to before the top label? */
 	if (zfs_is_readonly(zfsvfs)) {
 		err = SET_ERROR(EROFS);
 		goto out3;
 	}
 
 	/*
 	 * First validate permissions
 	 */
 
 	if (mask & ATTR_SIZE) {
 		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr,
 		    mnt_ns);
 		if (err)
 			goto out3;
 
 		/*
 		 * XXX - Note, we are not providing any open
 		 * mode flags here (like FNDELAY), so we may
 		 * block if there are locks present... this
 		 * should be addressed in openat().
 		 */
 		/* XXX - would it be OK to generate a log record here? */
 		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
 		if (err)
 			goto out3;
 	}
 
 	if (mask & (ATTR_ATIME|ATTR_MTIME) ||
 	    ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
 	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
 	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
 	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
 	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
 	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
 	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
 		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
 		    skipaclchk, cr, mnt_ns);
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID)) {
 		int	idmask = (mask & (ATTR_UID|ATTR_GID));
 		int	take_owner;
 		int	take_group;
 		uid_t	uid;
 		gid_t	gid;
 
 		/*
 		 * NOTE: even if a new mode is being set,
 		 * we may clear S_ISUID/S_ISGID bits.
 		 */
 
 		if (!(mask & ATTR_MODE))
 			vap->va_mode = zp->z_mode;
 
 		/*
 		 * Take ownership or chgrp to group we are a member of
 		 */
 
 		uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip),
 		    vap->va_uid);
 		gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip),
 		    vap->va_gid);
 		take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr));
 		take_group = (mask & ATTR_GID) &&
 		    zfs_groupmember(zfsvfs, gid, cr);
 
 		/*
 		 * If both ATTR_UID and ATTR_GID are set then take_owner and
 		 * take_group must both be set in order to allow taking
 		 * ownership.
 		 *
 		 * Otherwise, send the check through secpolicy_vnode_setattr()
 		 *
 		 */
 
 		if (((idmask == (ATTR_UID|ATTR_GID)) &&
 		    take_owner && take_group) ||
 		    ((idmask == ATTR_UID) && take_owner) ||
 		    ((idmask == ATTR_GID) && take_group)) {
 			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
 			    skipaclchk, cr, mnt_ns) == 0) {
 				/*
 				 * Remove setuid/setgid for non-privileged users
 				 */
 				(void) secpolicy_setid_clear(vap, cr);
 				trim_mask = (mask & (ATTR_UID|ATTR_GID));
 			} else {
 				need_policy =  TRUE;
 			}
 		} else {
 			need_policy =  TRUE;
 		}
 	}
 
 	mutex_enter(&zp->z_lock);
 	oldva.va_mode = zp->z_mode;
 	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
 	if (mask & ATTR_XVATTR) {
 		/*
 		 * Update xvattr mask to include only those attributes
 		 * that are actually changing.
 		 *
 		 * the bits will be restored prior to actually setting
 		 * the attributes so the caller thinks they were set.
 		 */
 		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
 			if (xoap->xoa_appendonly !=
 			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
 				XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
 			if (xoap->xoa_projinherit !=
 			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_PROJINHERIT);
 				XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
 			if (xoap->xoa_nounlink !=
 			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
 				XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
 			if (xoap->xoa_immutable !=
 			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
 				XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
 			if (xoap->xoa_nodump !=
 			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_NODUMP);
 				XVA_SET_REQ(tmpxvattr, XAT_NODUMP);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
 			if (xoap->xoa_av_modified !=
 			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
 				XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
 			if ((!S_ISREG(ip->i_mode) &&
 			    xoap->xoa_av_quarantined) ||
 			    xoap->xoa_av_quarantined !=
 			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
 				need_policy = TRUE;
 			} else {
 				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
 				XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED);
 			}
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
 			mutex_exit(&zp->z_lock);
 			err = SET_ERROR(EPERM);
 			goto out3;
 		}
 
 		if (need_policy == FALSE &&
 		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
 		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
 			need_policy = TRUE;
 		}
 	}
 
 	mutex_exit(&zp->z_lock);
 
 	if (mask & ATTR_MODE) {
 		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr,
 		    mnt_ns) == 0) {
 			err = secpolicy_setid_setsticky_clear(ip, vap,
 			    &oldva, cr, mnt_ns, zfs_i_user_ns(ip));
 			if (err)
 				goto out3;
 			trim_mask |= ATTR_MODE;
 		} else {
 			need_policy = TRUE;
 		}
 	}
 
 	if (need_policy) {
 		/*
 		 * If trim_mask is set then take ownership
 		 * has been granted or write_acl is present and user
 		 * has the ability to modify mode.  In that case remove
 		 * UID|GID and or MODE from mask so that
 		 * secpolicy_vnode_setattr() doesn't revoke it.
 		 */
 
 		if (trim_mask) {
 			saved_mask = vap->va_mask;
 			vap->va_mask &= ~trim_mask;
 		}
 		err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
 		    zfs_zaccess_unix, zp);
 		if (err)
 			goto out3;
 
 		if (trim_mask)
 			vap->va_mask |= saved_mask;
 	}
 
 	/*
 	 * secpolicy_vnode_setattr, or take ownership may have
 	 * changed va_mask
 	 */
 	mask = vap->va_mask;
 
 	if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
 		handle_eadir = B_TRUE;
 		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 		    &xattr_obj, sizeof (xattr_obj));
 
 		if (err == 0 && xattr_obj) {
 			err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
 			if (err)
 				goto out2;
 		}
 		if (mask & ATTR_UID) {
 			new_kuid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
 			if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
 			    zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
 			    new_kuid)) {
 				if (attrzp)
 					zrele(attrzp);
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (mask & ATTR_GID) {
 			new_kgid = zfs_fuid_create(zfsvfs,
 			    (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
 			if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
 			    zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
 			    new_kgid)) {
 				if (attrzp)
 					zrele(attrzp);
 				err = SET_ERROR(EDQUOT);
 				goto out2;
 			}
 		}
 
 		if (projid != ZFS_INVALID_PROJID &&
 		    zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
 			if (attrzp)
 				zrele(attrzp);
 			err = EDQUOT;
 			goto out2;
 		}
 	}
 	tx = dmu_tx_create(os);
 
 	if (mask & ATTR_MODE) {
 		uint64_t pmode = zp->z_mode;
 		uint64_t acl_obj;
 		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
 
 		if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
 		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
 			err = EPERM;
 			goto out;
 		}
 
 		if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
 			goto out;
 
 		mutex_enter(&zp->z_lock);
 		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
 			/*
 			 * Are we upgrading ACL from old V0 format
 			 * to V1 format?
 			 */
 			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
 			    zfs_znode_acl_version(zp) ==
 			    ZFS_ACL_VERSION_INITIAL) {
 				dmu_tx_hold_free(tx, acl_obj, 0,
 				    DMU_OBJECT_END);
 				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 				    0, aclp->z_acl_bytes);
 			} else {
 				dmu_tx_hold_write(tx, acl_obj, 0,
 				    aclp->z_acl_bytes);
 			}
 		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, aclp->z_acl_bytes);
 		}
 		mutex_exit(&zp->z_lock);
 		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	} else {
 		if (((mask & ATTR_XVATTR) &&
 		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
 		    (projid != ZFS_INVALID_PROJID &&
 		    !(zp->z_pflags & ZFS_PROJID)))
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 		else
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	}
 
 	if (attrzp) {
 		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
 	}
 
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err)
 		goto out;
 
 	count = 0;
 	/*
 	 * Set each attribute requested.
 	 * We group settings according to the locks they need to acquire.
 	 *
 	 * Note: you cannot set ctime directly, although it will be
 	 * updated as a side-effect of calling this function.
 	 */
 
 	if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
 		/*
 		 * For the existed object that is upgraded from old system,
 		 * its on-disk layout has no slot for the project ID attribute.
 		 * But quota accounting logic needs to access related slots by
 		 * offset directly. So we need to adjust old objects' layout
 		 * to make the project ID to some unified and fixed offset.
 		 */
 		if (attrzp)
 			err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
 		if (err == 0)
 			err = sa_add_projid(zp->z_sa_hdl, tx, projid);
 
 		if (unlikely(err == EEXIST))
 			err = 0;
 		else if (err != 0)
 			goto out;
 		else
 			projid = ZFS_INVALID_PROJID;
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 		mutex_enter(&zp->z_acl_lock);
 	mutex_enter(&zp->z_lock);
 
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (attrzp) {
 		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 			mutex_enter(&attrzp->z_acl_lock);
 		mutex_enter(&attrzp->z_lock);
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
 		    sizeof (attrzp->z_pflags));
 		if (projid != ZFS_INVALID_PROJID) {
 			attrzp->z_projid = projid;
 			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 			    SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
 			    sizeof (attrzp->z_projid));
 		}
 	}
 
 	if (mask & (ATTR_UID|ATTR_GID)) {
 
 		if (mask & ATTR_UID) {
 			ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
 			new_uid = zfs_uid_read(ZTOI(zp));
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
 			    &new_uid, sizeof (new_uid));
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
 				    sizeof (new_uid));
 				ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
 			}
 		}
 
 		if (mask & ATTR_GID) {
 			ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
 			new_gid = zfs_gid_read(ZTOI(zp));
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
 			    NULL, &new_gid, sizeof (new_gid));
 			if (attrzp) {
 				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
 				    sizeof (new_gid));
 				ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
 			}
 		}
 		if (!(mask & ATTR_MODE)) {
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
 			    NULL, &new_mode, sizeof (new_mode));
 			new_mode = zp->z_mode;
 		}
 		err = zfs_acl_chown_setattr(zp);
 		ASSERT(err == 0);
 		if (attrzp) {
 			err = zfs_acl_chown_setattr(attrzp);
 			ASSERT(err == 0);
 		}
 	}
 
 	if (mask & ATTR_MODE) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
 		    &new_mode, sizeof (new_mode));
 		zp->z_mode = ZTOI(zp)->i_mode = new_mode;
 		ASSERT3P(aclp, !=, NULL);
 		err = zfs_aclset_common(zp, aclp, cr, tx);
 		ASSERT0(err);
 		if (zp->z_acl_cached)
 			zfs_acl_free(zp->z_acl_cached);
 		zp->z_acl_cached = aclp;
 		aclp = NULL;
 	}
 
 	if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
 		zp->z_atime_dirty = B_FALSE;
 		inode_timespec_t tmp_atime = zpl_inode_get_atime(ip);
 		ZFS_TIME_ENCODE(&tmp_atime, atime);
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
 		    &atime, sizeof (atime));
 	}
 
 	if (mask & (ATTR_MTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
 		zpl_inode_set_mtime_to_ts(ZTOI(zp),
 		    zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp)));
 
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 		    mtime, sizeof (mtime));
 	}
 
 	if (mask & (ATTR_CTIME | ATTR_SIZE)) {
 		ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
 		zpl_inode_set_ctime_to_ts(ZTOI(zp),
 		    zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    ctime, sizeof (ctime));
 	}
 
 	if (projid != ZFS_INVALID_PROJID) {
 		zp->z_projid = projid;
 		SA_ADD_BULK_ATTR(bulk, count,
 		    SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
 		    sizeof (zp->z_projid));
 	}
 
 	if (attrzp && mask) {
 		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
 		    SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
 		    sizeof (ctime));
 	}
 
 	/*
 	 * Do this after setting timestamps to prevent timestamp
 	 * update from toggling bit
 	 */
 
 	if (xoap && (mask & ATTR_XVATTR)) {
 
 		/*
 		 * restore trimmed off masks
 		 * so that return masks can be set for caller.
 		 */
 
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) {
 			XVA_SET_REQ(xvap, XAT_APPENDONLY);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) {
 			XVA_SET_REQ(xvap, XAT_NOUNLINK);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) {
 			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) {
 			XVA_SET_REQ(xvap, XAT_NODUMP);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) {
 			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) {
 			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
 		}
 		if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) {
 			XVA_SET_REQ(xvap, XAT_PROJINHERIT);
 		}
 
 		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
 			ASSERT(S_ISREG(ip->i_mode));
 
 		zfs_xvattr_set(zp, xvap, tx);
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	if (mask != 0)
 		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
 
 	mutex_exit(&zp->z_lock);
 	if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 		mutex_exit(&zp->z_acl_lock);
 
 	if (attrzp) {
 		if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
 			mutex_exit(&attrzp->z_acl_lock);
 		mutex_exit(&attrzp->z_lock);
 	}
 out:
 	if (err == 0 && xattr_count > 0) {
 		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
 		    xattr_count, tx);
 		ASSERT(err2 == 0);
 	}
 
 	if (aclp)
 		zfs_acl_free(aclp);
 
 	if (fuidp) {
 		zfs_fuid_info_free(fuidp);
 		fuidp = NULL;
 	}
 
 	if (err) {
 		dmu_tx_abort(tx);
 		if (attrzp)
 			zrele(attrzp);
 		if (err == ERESTART)
 			goto top;
 	} else {
 		if (count > 0)
 			err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 		dmu_tx_commit(tx);
 		if (attrzp) {
 			if (err2 == 0 && handle_eadir)
 				err = zfs_setattr_dir(attrzp);
 			zrele(attrzp);
 		}
 		zfs_znode_update_vfs(zp);
 	}
 
 out2:
 	if (os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 out3:
 	kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks);
 	kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks);
 	kmem_free(tmpxvattr, sizeof (xvattr_t));
 	zfs_exit(zfsvfs, FTAG);
 	return (err);
 }
 
 /*
  * One node of the singly linked list of parent locks built by
  * zfs_rename_lock() and torn down by zfs_rename_unlock().  New nodes are
  * pushed at the head of the list.
  */
 typedef struct zfs_zlock {
 	krwlock_t	*zl_rwlock;	/* lock we acquired */
 	znode_t		*zl_znode;	/* znode we held (NULL if none) */
 	struct zfs_zlock *zl_next;	/* next in list */
 } zfs_zlock_t;
 
 /*
  * Tear down the lock list produced by zfs_rename_lock().
  *
  * Each entry is popped from the head of the list: its held znode (if
  * any) is released asynchronously, its rwlock is dropped and the entry
  * itself is freed.  On return *zlpp is NULL.
  */
 static void
 zfs_rename_unlock(zfs_zlock_t **zlpp)
 {
 	for (zfs_zlock_t *cur = *zlpp; cur != NULL; cur = *zlpp) {
 		znode_t *held = cur->zl_znode;
 
 		if (held != NULL)
 			zfs_zrele_async(held);
 		rw_exit(cur->zl_rwlock);
 
 		/* Unlink the entry before its memory is returned. */
 		*zlpp = cur->zl_next;
 		kmem_free(cur, sizeof (*cur));
 	}
 }
 
 /*
  * Search back through the directory tree, using the ".." entries.
  * Lock each directory in the chain to prevent concurrent renames.
  * Fail any attempt to move a directory into one of its own descendants.
  * XXX - z_parent_lock can overlap with map or grow locks
  *
  *	IN:	szp	- znode being renamed; its z_parent_lock is taken
  *			  as writer on the first pass.
  *		tdzp	- target directory; the walk towards the root
  *			  starts here.
  *		sdzp	- source directory; the walk stops once it is
  *			  reached.
  *
  *	OUT:	zlpp	- head of the list of locks (and znode holds) taken.
  *			  Entries pushed before an error return stay on the
  *			  list, so the caller must pass it to
  *			  zfs_rename_unlock() on failure as well.
  *
  *	RETURN:	0 on success, EINVAL if tdzp is szp or one of its
  *		descendants, or the error returned by zfs_zget().
  */
 static int
 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
 {
 	zfs_zlock_t	*zl;
 	znode_t		*zp = tdzp;
 	uint64_t	rootid = ZTOZSB(zp)->z_root;
 	uint64_t	oidp = zp->z_id;
 	krwlock_t	*rwlp = &szp->z_parent_lock;
 	krw_t		rw = RW_WRITER;
 
 	/*
 	 * First pass write-locks szp and compares to zp->z_id.
 	 * Later passes read-lock zp and compare to zp->z_parent.
 	 */
 	do {
 		if (!rw_tryenter(rwlp, rw)) {
 			/*
 			 * Another thread is renaming in this path.
 			 * Note that if we are a WRITER, we don't have any
 			 * parent_locks held yet.
 			 */
 			if (rw == RW_READER && zp->z_id > szp->z_id) {
 				/*
 				 * Drop our locks and restart
 				 *
 				 * rw is only RW_READER after at least one
 				 * pass, so zl is the most recently pushed
 				 * entry, i.e. the current head of *zlpp.
 				 *
 				 * NOTE(review): "continue" in a do/while
 				 * jumps to the loop condition, which is then
 				 * evaluated with zp reset to tdzp.
 				 */
 				zfs_rename_unlock(&zl);
 				*zlpp = NULL;
 				zp = tdzp;
 				oidp = zp->z_id;
 				rwlp = &szp->z_parent_lock;
 				rw = RW_WRITER;
 				continue;
 			} else {
 				/*
 				 * Wait for other thread to drop its locks
 				 */
 				rw_enter(rwlp, rw);
 			}
 		}
 
 		/* Record the lock just taken at the head of the list. */
 		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
 		zl->zl_rwlock = rwlp;
 		zl->zl_znode = NULL;
 		zl->zl_next = *zlpp;
 		*zlpp = zl;
 
 		if (oidp == szp->z_id)		/* We're a descendant of szp */
 			return (SET_ERROR(EINVAL));
 
 		if (oidp == rootid)		/* We've hit the top */
 			return (0);
 
 		if (rw == RW_READER) {		/* i.e. not the first pass */
 			int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
 			if (error)
 				return (error);
 			/* The hold is dropped by zfs_rename_unlock(). */
 			zl->zl_znode = zp;
 		}
 		/* Step up: oidp becomes the object id of zp's parent. */
 		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
 		    &oidp, sizeof (oidp));
 		rwlp = &zp->z_parent_lock;
 		rw = RW_READER;
 
 	} while (zp->z_id != sdzp->z_id);
 
 	return (0);
 }
 
 /*
  * Move an entry from the provided source directory to the target
  * directory.  Change the entry name as indicated.
  *
  *	IN:	sdzp	- Source directory containing the "old entry".
  *		snm	- Old entry name.
  *		tdzp	- Target directory to contain the "new entry".
  *		tnm	- New entry name.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *		rflags  - RENAME_* flags
  *		wo_vap  - attributes for RENAME_WHITEOUT (must be a char 0:0).
  *		mnt_ns	- user namespace of the mount
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	sdzp,tdzp - ctime|mtime updated
  *
  * The ACL ids created for the RENAME_WHITEOUT inode (tracked by have_acl)
  * survive an ERESTART retry so that they are only built once.  Every
  * return below the "top" label therefore has to free them, either through
  * the common "out" path or explicitly.
  */
 int
 zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
     cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns)
 {
 	znode_t		*szp, *tzp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(sdzp);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*sdl, *tdl;
 	dmu_tx_t	*tx;
 	zfs_zlock_t	*zl;
 	int		cmp, serr, terr;
 	int		error = 0;
 	int		zflg = 0;
 	boolean_t	waited = B_FALSE;
 	/* Needed for whiteout inode creation. */
 	boolean_t	fuid_dirtied;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	have_acl = B_FALSE;
 	znode_t		*wzp = NULL;
 
 	if (snm == NULL || tnm == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return (SET_ERROR(EINVAL));
 
 	/* Already checked by Linux VFS, but just to make sure. */
 	if (rflags & RENAME_EXCHANGE &&
 	    (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT)))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the
 	 * right kind of vattr_t for the whiteout file. These are set
 	 * internally by ZFS so should never be incorrect.
 	 */
 	VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL);
 	VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR);
 	VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	if ((error = zfs_verify_zp(tdzp)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * We check i_sb because snapshots and the ctldir must have different
 	 * super blocks.
 	 */
 	if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
 	    zfsctl_is_node(ZTOI(tdzp))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(tnm,
 	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 top:
 	/*
 	 * Re-entered after an ERESTART from dmu_tx_assign().  have_acl and
 	 * acl_ids deliberately keep their values across the retry.
 	 */
 	szp = NULL;
 	tzp = NULL;
 	zl = NULL;
 
 	/*
 	 * This is to prevent the creation of links into attribute space
 	 * by renaming a linked file into/outof an attribute directory.
 	 * See the comment in zfs_link() for why this is considered bad.
 	 */
 	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Lock source and target directory entries.  To prevent deadlock,
 	 * a lock ordering must be defined.  We lock the directory with
 	 * the smallest object id first, or if it's a tie, the one with
 	 * the lexically first name.
 	 */
 	if (sdzp->z_id < tdzp->z_id) {
 		cmp = -1;
 	} else if (sdzp->z_id > tdzp->z_id) {
 		cmp = 1;
 	} else {
 		/*
 		 * First compare the two name arguments without
 		 * considering any case folding.
 		 */
 		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
 
 		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
 		ASSERT(error == 0 || !zfsvfs->z_utf8);
 		if (cmp == 0) {
 			/*
 			 * POSIX: "If the old argument and the new argument
 			 * both refer to links to the same existing file,
 			 * the rename() function shall return successfully
 			 * and perform no other action."
 			 */
 			if (have_acl)
 				zfs_acl_ids_free(&acl_ids);
 			zfs_exit(zfsvfs, FTAG);
 			return (0);
 		}
 		/*
 		 * If the file system is case-folding, then we may
 		 * have some more checking to do.  A case-folding file
 		 * system is either supporting mixed case sensitivity
 		 * access or is completely case-insensitive.  Note
 		 * that the file system is always case preserving.
 		 *
 		 * In mixed sensitivity mode case sensitive behavior
 		 * is the default.  FIGNORECASE must be used to
 		 * explicitly request case insensitive behavior.
 		 *
 		 * If the source and target names provided differ only
 		 * by case (e.g., a request to rename 'tim' to 'Tim'),
 		 * we will treat this as a special case in the
 		 * case-insensitive mode: as long as the source name
 		 * is an exact match, we will allow this to proceed as
 		 * a name-change request.
 		 */
 		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
 		    flags & FIGNORECASE)) &&
 		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
 		    &error) == 0) {
 			/*
 			 * case preserving rename request, require exact
 			 * name matches
 			 */
 			zflg |= ZCIEXACT;
 			zflg &= ~ZCILOOK;
 		}
 	}
 
 	/*
 	 * If the source and destination directories are the same, we should
 	 * grab the z_name_lock of that directory only once.
 	 */
 	if (sdzp == tdzp) {
 		zflg |= ZHAVELOCK;
 		rw_enter(&sdzp->z_name_lock, RW_READER);
 	}
 
 	if (cmp < 0) {
 		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
 		    ZEXISTS | zflg, NULL, NULL);
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
 	} else {
 		terr = zfs_dirent_lock(&tdl,
 		    tdzp, tnm, &tzp, zflg, NULL, NULL);
 		serr = zfs_dirent_lock(&sdl,
 		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
 		    NULL, NULL);
 	}
 
 	if (serr) {
 		/*
 		 * Source entry invalid or not there.
 		 */
 		if (!terr) {
 			zfs_dirent_unlock(tdl);
 			if (tzp)
 				zrele(tzp);
 		}
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (strcmp(snm, "..") == 0)
 			serr = EINVAL;
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (serr);
 	}
 	if (terr) {
 		zfs_dirent_unlock(sdl);
 		zrele(szp);
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (strcmp(tnm, "..") == 0)
 			terr = EINVAL;
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (terr);
 	}
 
 	/*
 	 * If we are using project inheritance, means if the directory has
 	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
 	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
 	 * such case, we only allow renames into our tree when the project
 	 * IDs are the same.
 	 */
 	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
 	    tdzp->z_projid != szp->z_projid) {
 		error = SET_ERROR(EXDEV);
 		goto out;
 	}
 
 	/*
 	 * Must have write access at the source to remove the old entry
 	 * and write access at the target to create the new entry.
 	 * Note that if target and source are the same, this can be
 	 * done in a single check.
 	 */
 	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns)))
 		goto out;
 
 	if (S_ISDIR(ZTOI(szp)->i_mode)) {
 		/*
 		 * Check to make sure rename is valid.
 		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
 		 */
 		if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
 			goto out;
 	}
 
 	/*
 	 * Does target exist?
 	 */
 	if (tzp) {
 		if (rflags & RENAME_NOREPLACE) {
 			error = SET_ERROR(EEXIST);
 			goto out;
 		}
 		/*
 		 * Source and target must be the same type (unless exchanging).
 		 */
 		if (!(rflags & RENAME_EXCHANGE)) {
 			boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0;
 			boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0;
 
 			if (s_is_dir != t_is_dir) {
 				error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR);
 				goto out;
 			}
 		}
 		/*
 		 * POSIX dictates that when the source and target
 		 * entries refer to the same file object, rename
 		 * must do nothing and exit without error.
 		 */
 		if (szp->z_id == tzp->z_id) {
 			error = 0;
 			goto out;
 		}
 	} else if (rflags & RENAME_EXCHANGE) {
 		/* Target must exist for RENAME_EXCHANGE. */
 		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
 	/* Set up inode creation for RENAME_WHITEOUT. */
 	if (rflags & RENAME_WHITEOUT) {
 		/*
 		 * Whiteout files are not regular files or directories, so to
 		 * match zfs_create() we do not inherit the project id.
 		 */
 		uint64_t wo_projid = ZFS_DEFAULT_PROJID;
 
 		error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns);
 		if (error)
 			goto out;
 
 		/* Only build the ACL ids once, even across retries. */
 		if (!have_acl) {
 			error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL,
 			    &acl_ids, mnt_ns);
 			if (error)
 				goto out;
 			have_acl = B_TRUE;
 		}
 
 		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) {
 			error = SET_ERROR(EDQUOT);
 			goto out;
 		}
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, sdzp->z_id,
 	    (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
 	if (sdzp != tdzp) {
 		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tdzp);
 	}
 	if (tzp) {
 		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
 		zfs_sa_upgrade_txholds(tx, tzp);
 	}
 	if (rflags & RENAME_WHITEOUT) {
 		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 		    ZFS_SA_BASE_ATTR_SIZE);
 
 		dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm);
 		dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
 		if (!zfsvfs->z_use_sa &&
 		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
 	}
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
 		zfs_dirent_unlock(sdl);
 		zfs_dirent_unlock(tdl);
 
 		if (sdzp == tdzp)
 			rw_exit(&sdzp->z_name_lock);
 
 		if (error == ERESTART) {
 			/* Keep acl_ids: it is reused after the retry. */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			zrele(szp);
 			if (tzp)
 				zrele(tzp);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		zrele(szp);
 		if (tzp)
 			zrele(tzp);
 		/* This return bypasses "out", so free the ACL ids here. */
 		if (have_acl)
 			zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Unlink the source.
 	 */
 	szp->z_pflags |= ZFS_AV_MODIFIED;
 	if (tdzp->z_pflags & ZFS_PROJINHERIT)
 		szp->z_pflags |= ZFS_PROJINHERIT;
 
 	error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 	    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
 	VERIFY0(error);
 
 	error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
 	if (error)
 		goto commit;
 
 	/*
 	 * Unlink the target.
 	 */
 	if (tzp) {
 		int tzflg = zflg;
 
 		if (rflags & RENAME_EXCHANGE) {
 			/* This inode will be re-linked soon. */
 			tzflg |= ZRENAMING;
 
 			tzp->z_pflags |= ZFS_AV_MODIFIED;
 			if (sdzp->z_pflags & ZFS_PROJINHERIT)
 				tzp->z_pflags |= ZFS_PROJINHERIT;
 
 			error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
 			    (void *)&tzp->z_pflags, sizeof (uint64_t), tx);
 			ASSERT0(error);
 		}
 		error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL);
 		if (error)
 			goto commit_link_szp;
 	}
 
 	/*
 	 * Create the new target links:
 	 *   * We always link the target.
 	 *   * RENAME_EXCHANGE: Link the old target to the source.
 	 *   * RENAME_WHITEOUT: Create a whiteout inode in-place of the source.
 	 */
 	error = zfs_link_create(tdl, szp, tx, ZRENAMING);
 	if (error) {
 		/*
 		 * If we have removed the existing target, a subsequent call to
 		 * zfs_link_create() to add back the same entry, but with a new
 		 * dnode (szp), should not fail.
 		 */
 		ASSERT3P(tzp, ==, NULL);
 		goto commit_link_tzp;
 	}
 
 	switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
 	case RENAME_EXCHANGE:
 		error = zfs_link_create(sdl, tzp, tx, ZRENAMING);
 		/*
 		 * The same argument as zfs_link_create() failing for
 		 * szp applies here, since the source directory must
 		 * have had an entry we are replacing.
 		 */
 		ASSERT0(error);
 		if (error)
 			goto commit_unlink_td_szp;
 		break;
 	case RENAME_WHITEOUT:
 		zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids);
 		error = zfs_link_create(sdl, wzp, tx, ZNEW);
 		if (error) {
 			zfs_znode_delete(wzp, tx);
 			remove_inode_hash(ZTOI(wzp));
 			goto commit_unlink_td_szp;
 		}
 		break;
 	}
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) {
 	case RENAME_EXCHANGE:
 		zfs_log_rename_exchange(zilog, tx,
 		    (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
 		    tdzp, tdl->dl_name, szp);
 		break;
 	case RENAME_WHITEOUT:
 		zfs_log_rename_whiteout(zilog, tx,
 		    (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name,
 		    tdzp, tdl->dl_name, szp, wzp);
 		break;
 	default:
 		ASSERT0(rflags & ~RENAME_NOREPLACE);
 		zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0),
 		    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
 		break;
 	}
 
 commit:
 	dmu_tx_commit(tx);
 out:
 	/* Common exit once both dirent locks and szp are held. */
 	if (have_acl)
 		zfs_acl_ids_free(&acl_ids);
 
 	zfs_znode_update_vfs(sdzp);
 	if (sdzp == tdzp)
 		rw_exit(&sdzp->z_name_lock);
 
 	if (sdzp != tdzp)
 		zfs_znode_update_vfs(tdzp);
 
 	zfs_znode_update_vfs(szp);
 	zrele(szp);
 	if (wzp) {
 		zfs_znode_update_vfs(wzp);
 		zrele(wzp);
 	}
 	if (tzp) {
 		zfs_znode_update_vfs(tzp);
 		zrele(tzp);
 	}
 
 	if (zl != NULL)
 		zfs_rename_unlock(&zl);
 
 	zfs_dirent_unlock(sdl);
 	zfs_dirent_unlock(tdl);
 
 	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 
 	/*
 	 * Clean-up path for broken link state.
 	 *
 	 * At this point we are in a (very) bad state, so we need to do our
 	 * best to correct the state. In particular, all of the nlinks are
 	 * wrong because we were destroying and creating links with ZRENAMING.
 	 *
 	 * In some form, all of these operations have to resolve the state:
 	 *
 	 *  * link_destroy() *must* succeed. Fortunately, this is very likely
 	 *    since we only just created it.
 	 *
 	 *  * link_create()s are allowed to fail (though they shouldn't because
 	 *    we only just unlinked them and are putting the entries back
 	 *    during clean-up). But if they fail, we can just forcefully drop
 	 *    the nlink value to (at the very least) avoid broken nlink values
 	 *    -- though in the case of non-empty directories we will have to
 	 *    panic (otherwise we'd have a leaked directory with a broken ..).
 	 */
 commit_unlink_td_szp:
 	VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL));
 commit_link_tzp:
 	if (tzp) {
 		if (zfs_link_create(tdl, tzp, tx, ZRENAMING))
 			VERIFY0(zfs_drop_nlink(tzp, tx, NULL));
 	}
 commit_link_szp:
 	if (zfs_link_create(sdl, szp, tx, ZRENAMING))
 		VERIFY0(zfs_drop_nlink(szp, tx, NULL));
 	goto commit;
 }
 
 /*
  * Insert the indicated symbolic reference entry into the directory.
  *
  *	IN:	dzp	- Directory to contain new symbolic link.
  *		name	- Name of directory entry in dip.
  *		vap	- Attributes of new entry.
  *		link	- Name for new symlink entry.
  *		cr	- credentials of caller.
  *		flags	- case flags
  *		mnt_ns	- user namespace of the mount
  *
  *	OUT:	zpp	- Znode for new symbolic link.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	dip - ctime|mtime updated
  *
  * Note that link is passed to strlen() in the declarations, i.e. before
  * any argument checking, so it must not be NULL.
  */
 int
 zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
     znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns)
 {
 	znode_t		*zp;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zilog_t		*zilog;
 	uint64_t	len = strlen(link);
 	int		error;
 	int		zflg = ZNEW;
 	zfs_acl_ids_t	acl_ids;
 	boolean_t	fuid_dirtied;
 	uint64_t	txtype = TX_SYMLINK;
 	boolean_t	waited = B_FALSE;
 
 	ASSERT(S_ISLNK(vap->va_mode));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
 	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zflg |= ZCILOOK;
 
 	if (len > MAXPATHLEN) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	/*
 	 * The ACL ids are created once, ahead of the retry label, and are
 	 * freed on every exit path below.
 	 */
 	if ((error = zfs_acl_ids_create(dzp, 0,
 	    vap, cr, NULL, &acl_ids, mnt_ns)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 top:
 	*zpp = NULL;
 
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
 		zfs_acl_ids_free(&acl_ids);
 		zfs_dirent_unlock(dl);
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 	tx = dmu_tx_create(zfsvfs->z_os);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE + len);
 	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
 		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
 		    acl_ids.z_aclp->z_acl_bytes);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			/* Retry; acl_ids is kept for the next attempt. */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * Create a new object for the symlink.
 	 * for version 4 ZPL datasets the symlink will be an SA attribute
 	 */
 	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 	/*
 	 * Store the link target under z_lock.
 	 *
 	 * NOTE(review): the value assigned to error by sa_update() here is
 	 * never examined; it is overwritten by zfs_link_create() below.
 	 */
 	mutex_enter(&zp->z_lock);
 	if (zp->z_is_sa)
 		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
 		    link, len, tx);
 	else
 		zfs_sa_symlink(zp, link, len, tx);
 	mutex_exit(&zp->z_lock);
 
 	/* The size of a symlink is the length of its target path. */
 	zp->z_size = len;
 	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 	    &zp->z_size, sizeof (zp->z_size), tx);
 	/*
 	 * Insert the new object into the directory.
 	 */
 	error = zfs_link_create(dl, zp, tx, ZNEW);
 	if (error != 0) {
 		/* Undo the zfs_mknode() above within the same tx. */
 		zfs_znode_delete(zp, tx);
 		remove_inode_hash(ZTOI(zp));
 	} else {
 		if (flags & FIGNORECASE)
 			txtype |= TX_CI;
 		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
 
 		zfs_znode_update_vfs(dzp);
 		zfs_znode_update_vfs(zp);
 	}
 
 	zfs_acl_ids_free(&acl_ids);
 
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (error == 0) {
 		/* Success: hand the held znode to the caller. */
 		*zpp = zp;
 
 		if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 			zil_commit(zilog, 0);
 	} else {
 		/* Failure: drop our hold; *zpp stays NULL. */
 		zrele(zp);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Copy the target path of the symbolic link ip into the buffer described
  * by the supplied uio structure.
  *
  *	IN:	ip	- inode of symbolic link
  *		uio	- structure to contain the link path.
  *		cr	- credentials of caller (unused).
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - atime updated (not done explicitly in this function)
  */
 int
 zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr)
 {
 	(void) cr;
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	znode_t		*zp = ITOZ(ip);
 	int		rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 
 	if (rc != 0)
 		return (rc);
 
 	/* z_lock is held while the link target is read. */
 	mutex_enter(&zp->z_lock);
 	if (!zp->z_is_sa)
 		rc = zfs_sa_readlink(zp, uio);
 	else
 		rc = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio);
 	mutex_exit(&zp->z_lock);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (rc);
 }
 
 /*
  * Insert a new entry into directory tdzp referencing szp.
  *
  *	IN:	tdzp	- Directory to contain new entry.
  *		szp	- znode of new entry.
  *		name	- name of new entry.
  *		cr	- credentials of caller.
  *		flags	- case flags.
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	tdzp - ctime|mtime updated
  *	 szp - ctime updated
  *
  * When HAVE_TMPFILE is defined and szp is a linkable inode with a zero
  * link count, it is treated as a tmpfile: it is removed from the unlinked
  * set, the operation is not logged to the ZIL, and the function waits for
  * the txg to sync instead (unless sync is disabled).
  */
 int
 zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
     int flags)
 {
 	struct inode *sip = ZTOI(szp);
 	znode_t		*tzp;
 	zfsvfs_t	*zfsvfs = ZTOZSB(tdzp);
 	zilog_t		*zilog;
 	zfs_dirlock_t	*dl;
 	dmu_tx_t	*tx;
 	int		error;
 	int		zf = ZNEW;
 	uint64_t	parent;
 	uid_t		owner;
 	boolean_t	waited = B_FALSE;
 	boolean_t	is_tmpfile = 0;
 	uint64_t	txg;
 #ifdef HAVE_TMPFILE
 	is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
 #endif
 	ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
 
 	if (name == NULL)
 		return (SET_ERROR(EINVAL));
 
 	if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0)
 		return (error);
 	zilog = zfsvfs->z_log;
 
 	/*
 	 * POSIX dictates that we return EPERM here.
 	 * Better choices include ENOTSUP or EISDIR.
 	 */
 	if (S_ISDIR(sip->i_mode)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	if ((error = zfs_verify_zp(szp)) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	/*
 	 * If we are using project inheritance, means if the directory has
 	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
 	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
 	 * such case, we only allow hard link creation in our tree when the
 	 * project IDs are the same.
 	 */
 	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
 	    tdzp->z_projid != szp->z_projid) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/*
 	 * We check i_sb because snapshots and the ctldir must have different
 	 * super blocks.
 	 */
 	if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EXDEV));
 	}
 
 	/* Prevent links to .zfs/shares files */
 
 	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (uint64_t))) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	if (parent == zfsvfs->z_shares_dir) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	if (zfsvfs->z_utf8 && u8_validate(name,
 	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EILSEQ));
 	}
 	if (flags & FIGNORECASE)
 		zf |= ZCILOOK;
 
 	/*
 	 * We do not support links between attributes and non-attributes
 	 * because of the potential security risk of creating links
 	 * into "normal" file space in order to circumvent restrictions
 	 * imposed in attribute space.
 	 */
 	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A caller that does not own the source file additionally has to
 	 * pass the secpolicy_basic_link() check.
 	 */
 	owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
 	    cr, ZFS_OWNER);
 	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(EPERM));
 	}
 
 	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr,
 	    zfs_init_idmap))) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 top:
 	/*
 	 * Attempt to lock directory; fail if entry already exists.
 	 */
 	error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
 	if (error) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
 	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
 	/* A tmpfile is also removed from the unlinked set in this tx. */
 	if (is_tmpfile)
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, tdzp);
 	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
 			/* Wait for the tx to become assignable and retry. */
 			waited = B_TRUE;
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
 			goto top;
 		}
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 	/* unmark z_unlinked so zfs_link_create will not reject */
 	if (is_tmpfile)
 		szp->z_unlinked = B_FALSE;
 	error = zfs_link_create(dl, szp, tx, 0);
 
 	if (error == 0) {
 		uint64_t txtype = TX_LINK;
 		/*
 		 * tmpfile is created to be in z_unlinkedobj, so remove it.
 		 * Also, we don't log in ZIL, because all previous file
 		 * operation on the tmpfile are ignored by ZIL. Instead we
 		 * always wait for txg to sync to make sure all previous
 		 * operation are sync safe.
 		 */
 		if (is_tmpfile) {
 			VERIFY(zap_remove_int(zfsvfs->z_os,
 			    zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
 		} else {
 			if (flags & FIGNORECASE)
 				txtype |= TX_CI;
 			zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
 		}
 	} else if (is_tmpfile) {
 		/* restore z_unlinked since when linking failed */
 		szp->z_unlinked = B_TRUE;
 	}
 	/* Remember the txg before commit for the tmpfile sync wait below. */
 	txg = dmu_tx_get_txg(tx);
 	dmu_tx_commit(tx);
 
 	zfs_dirent_unlock(dl);
 
 	if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 		zil_commit(zilog, 0);
 
 	/* Runs whether or not the link was created (error is not checked). */
 	if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
 
 	zfs_znode_update_vfs(tdzp);
 	zfs_znode_update_vfs(szp);
 	zfs_exit(zfsvfs, FTAG);
 	return (error);
 }
 
 /*
  * Commit callback taking a struct page as its argument: clears the
  * page's error flag and ends writeback on it.  Presumably registered by
  * zfs_putpage() for synchronous writeback; confirm against the caller.
  */
 static void
 zfs_putpage_sync_commit_cb(void *arg)
 {
 	struct page *page = (struct page *)arg;
 
 	ClearPageError(page);
 	end_page_writeback(page);
 }
 
 /*
  * Commit callback taking a struct page as its argument: clears the
  * page's error flag, ends writeback on it and decrements the owning
  * znode's z_async_writes_cnt.  Presumably registered by zfs_putpage()
  * for asynchronous writeback; confirm against the caller.
  */
 static void
 zfs_putpage_async_commit_cb(void *arg)
 {
 	struct page *page = (struct page *)arg;
 	/* Resolve the znode before writeback is ended on the page. */
 	struct inode *host = page->mapping->host;
 	znode_t *owner = ITOZ(host);
 
 	ClearPageError(page);
 	end_page_writeback(page);
 	atomic_dec_32(&owner->z_async_writes_cnt);
 }
 
 /*
  * Push a page out to disk, once the page is on stable storage the
  * registered commit callback will be run as notification of completion.
  *
  *	IN:	ip	 - inode the page belongs to.
  *		pp	 - page to push (page is locked)
  *		wbc	 - writeback control data
  *		for_sync - does the caller intend to wait synchronously for the
  *			   page writeback to complete?
  *
  *	RETURN:	0 if success
  *		error code if failure
  *
  * Timestamps:
  *	ip - ctime|mtime updated
  */
 int
 zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
     boolean_t for_sync)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	loff_t		offset;
 	loff_t		pgoff;
 	unsigned int	pglen;
 	dmu_tx_t	*tx;
 	caddr_t		va;
 	int		err = 0;
 	uint64_t	mtime[2], ctime[2];
 	inode_timespec_t tmp_ts;
 	sa_bulk_attr_t	bulk[3];
 	int		cnt = 0;
 	struct address_space *mapping;
 
 	if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
 		return (err);
 
 	ASSERT(PageLocked(pp));
 
 	pgoff = page_offset(pp);	/* Page byte-offset in file */
 	offset = i_size_read(ip);	/* File length in bytes */
 	pglen = MIN(PAGE_SIZE,		/* Page length in bytes */
 	    P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
 
 	/* Page is beyond end of file */
 	if (pgoff >= offset) {
 		unlock_page(pp);
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	/* Truncate page length to end of file */
 	if (pgoff + pglen > offset)
 		pglen = offset - pgoff;
 
 #if 0
 	/*
 	 * FIXME: Allow mmap writes past its quota.  The correct fix
 	 * is to register a page_mkwrite() handler to count the page
 	 * against its quota when it is about to be dirtied.
 	 */
 	if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
 	    KUID_TO_SUID(ip->i_uid)) ||
 	    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
 	    KGID_TO_SGID(ip->i_gid)) ||
 	    (zp->z_projid != ZFS_DEFAULT_PROJID &&
 	    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
 	    zp->z_projid))) {
 		err = EDQUOT;
 	}
 #endif
 
 	/*
 	 * The ordering here is critical and must adhere to the following
 	 * rules in order to avoid deadlocking in either zfs_read() or
 	 * zfs_free_range() due to a lock inversion.
 	 *
 	 * 1) The page must be unlocked prior to acquiring the range lock.
 	 *    This is critical because zfs_read() calls find_lock_page()
 	 *    which may block on the page lock while holding the range lock.
 	 *
 	 * 2) Before setting or clearing write back on a page the range lock
 	 *    must be held in order to prevent a lock inversion with the
 	 *    zfs_free_range() function.
 	 *
 	 * This presents a problem because upon entering this function the
 	 * page lock is already held.  To safely acquire the range lock the
 	 * page lock must be dropped.  This creates a window where another
 	 * process could truncate, invalidate, dirty, or write out the page.
 	 *
 	 * Therefore, after successfully reacquiring the range and page locks
 	 * the current page state is checked.  In the common case everything
 	 * will be as is expected and it can be written out.  However, if
 	 * the page state has changed it must be handled accordingly.
 	 */
 	/* Remember the mapping so a change can be detected after relocking. */
 	mapping = pp->mapping;
 	/*
 	 * Re-dirty the page before dropping its lock; the matching
 	 * pages_skipped adjustment is made below once the page is known to
 	 * be written out by this call.
 	 */
 	redirty_page_for_writepage(wbc, pp);
 	unlock_page(pp);
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
 	    pgoff, pglen, RL_WRITER);
 	lock_page(pp);
 
 	/* Page mapping changed or it was no longer dirty, we're done */
 	if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
 		unlock_page(pp);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	/* Another process started writeback; block on it if required */
 	if (PageWriteback(pp)) {
 		unlock_page(pp);
 		zfs_rangelock_exit(lr);
 
 		/* Only callers that are not WB_SYNC_NONE wait for completion. */
 		if (wbc->sync_mode != WB_SYNC_NONE) {
 			/*
 			 * Speed up any non-sync page writebacks since
 			 * they may take several seconds to complete.
 			 * Refer to the comment in zpl_fsync() (when
 			 * HAVE_FSYNC_RANGE is defined) for details.
 			 */
 			if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
 				zil_commit(zfsvfs->z_log, zp->z_id);
 			}
 
 			/* Still under writeback after the commit: wait. */
 			if (PageWriteback(pp))
 #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT
 				folio_wait_bit(page_folio(pp), PG_writeback);
 #else
 				wait_on_page_bit(pp, PG_writeback);
 #endif
 		}
 
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	/* Clear the dirty flag now that the required locks are held */
 	if (!clear_page_dirty_for_io(pp)) {
 		unlock_page(pp);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 
 	/*
 	 * Counterpart for redirty_page_for_writepage() above.  This page
 	 * was in fact not skipped and should not be counted as if it were.
 	 */
 	wbc->pages_skipped--;
 	/* Balanced by zfs_putpage_async_commit_cb() or the error path below. */
 	if (!for_sync)
 		atomic_inc_32(&zp->z_async_writes_cnt);
 	set_page_writeback(pp);
 	unlock_page(pp);
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
 		/*
 		 * The tx could not be assigned: mark the page dirty again,
 		 * end writeback, undo the async write count, and fail.
 		 */
 		dmu_tx_abort(tx);
 #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
 		filemap_dirty_folio(page_mapping(pp), page_folio(pp));
 #else
 		__set_page_dirty_nobuffers(pp);
 #endif
 		ClearPageError(pp);
 		end_page_writeback(pp);
 		if (!for_sync)
 			atomic_dec_32(&zp->z_async_writes_cnt);
 		zfs_rangelock_exit(lr);
 		zfs_exit(zfsvfs, FTAG);
 		return (err);
 	}
 
 	/* Map the page and copy its first pglen bytes into the DMU. */
 	va = kmap(pp);
 	ASSERT3U(pglen, <=, PAGE_SIZE);
 	dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
 	kunmap(pp);
 
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 	SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, 8);
 
 	/* Preserve the mtime and ctime provided by the inode */
 	tmp_ts = zpl_inode_get_mtime(ip);
 	ZFS_TIME_ENCODE(&tmp_ts, mtime);
 	tmp_ts = zpl_inode_get_ctime(ip);
 	ZFS_TIME_ENCODE(&tmp_ts, ctime);
 	zp->z_atime_dirty = B_FALSE;
 	zp->z_seq++;
 
 	/* The result of this update is what the function returns below. */
 	err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
 
 	/*
 	 * Log the write and register the commit callback that ends
 	 * writeback on the page; the async variant also decrements
 	 * z_async_writes_cnt.
 	 */
 	zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
 	    for_sync ? zfs_putpage_sync_commit_cb :
 	    zfs_putpage_async_commit_cb, pp);
 
 	dmu_tx_commit(tx);
 
 	zfs_rangelock_exit(lr);
 
 	if (wbc->sync_mode != WB_SYNC_NONE) {
 		/*
 		 * Note that this is rarely called under writepages(), because
 		 * writepages() normally handles the entire commit for
 		 * performance reasons.
 		 */
 		zil_commit(zfsvfs->z_log, zp->z_id);
 	} else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) {
 		/*
 		 * If the caller does not intend to wait synchronously
 		 * for this page writeback to complete and there are active
 		 * synchronous calls on this file, do a commit so that
 		 * the latter don't accidentally end up waiting for
 		 * our writeback to complete. Refer to the comment in
 		 * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details.
 		 */
 		zil_commit(zfsvfs->z_log, zp->z_id);
 	}
 
 	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen);
 
 	zfs_exit(zfsvfs, FTAG);
 	return (err);
 }
 
 /*
  * Write the inode's mode, atime, mtime and ctime to the znode's system
  * attributes after the inode has been dirtied.
  *
  *	IN:	ip	- inode that was dirtied.
  *		flags	- dirty flags; when I_DIRTY_TIME is defined and flags
  *			  equals it, only z_atime_dirty is set.
  *
  *	RETURN:	0 on success, or when the filesystem is read-only or a
  *		snapshot; error code on failure.
  */
 int
 zfs_dirty_inode(struct inode *ip, int flags)
 {
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	znode_t		*zp = ITOZ(ip);
 	sa_bulk_attr_t	attrs[4];
 	uint64_t	mode_val, atime_enc[2], mtime_enc[2], ctime_enc[2];
 	inode_timespec_t ts;
 	dmu_tx_t	*tx;
 	int		nattrs = 0;
 	int		rc;
 
 	/* Nothing is written for a read-only filesystem or a snapshot. */
 	if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
 		return (0);
 
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 #ifdef I_DIRTY_TIME
 	/*
 	 * Lazytime semantic introduced in Linux 4.0.  This flag is only
 	 * passed from update_time when lazytime is set (I_DIRTY_SYNC is
 	 * also set when lazytime is not in use).  Because mtime and ctime
 	 * are managed within ZFS itself, only atime needs to be marked dirty.
 	 */
 	if (flags == I_DIRTY_TIME) {
 		zp->z_atime_dirty = B_TRUE;
 		zfs_exit(zfsvfs, FTAG);
 		return (0);
 	}
 #endif
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 	zfs_sa_upgrade_txholds(tx, zp);
 
 	rc = dmu_tx_assign(tx, TXG_WAIT);
 	if (rc != 0) {
 		dmu_tx_abort(tx);
 		zfs_exit(zfsvfs, FTAG);
 		return (rc);
 	}
 
 	mutex_enter(&zp->z_lock);
 	zp->z_atime_dirty = B_FALSE;
 
 	/* Capture the mode and timestamps currently held by the inode. */
 	mode_val = ip->i_mode;
 	ts = zpl_inode_get_atime(ip);
 	ZFS_TIME_ENCODE(&ts, atime_enc);
 	ts = zpl_inode_get_mtime(ip);
 	ZFS_TIME_ENCODE(&ts, mtime_enc);
 	ts = zpl_inode_get_ctime(ip);
 	ZFS_TIME_ENCODE(&ts, ctime_enc);
 
 	zp->z_mode = mode_val;
 
 	SA_ADD_BULK_ATTR(attrs, nattrs, SA_ZPL_MODE(zfsvfs), NULL,
 	    &mode_val, 8);
 	SA_ADD_BULK_ATTR(attrs, nattrs, SA_ZPL_ATIME(zfsvfs), NULL,
 	    &atime_enc, 16);
 	SA_ADD_BULK_ATTR(attrs, nattrs, SA_ZPL_MTIME(zfsvfs), NULL,
 	    &mtime_enc, 16);
 	SA_ADD_BULK_ATTR(attrs, nattrs, SA_ZPL_CTIME(zfsvfs), NULL,
 	    &ctime_enc, 16);
 
 	rc = sa_bulk_update(zp->z_sa_hdl, attrs, nattrs, tx);
 	mutex_exit(&zp->z_lock);
 
 	dmu_tx_commit(tx);
 	zfs_exit(zfsvfs, FTAG);
 	return (rc);
 }
 
 /*
  * Called when an inode becomes inactive.  If the znode still has an SA
  * handle, a dirty atime is written out (unless the file is unlinked) and
  * the znode is handed to zfs_zinactive().
  */
 void
 zfs_inactive(struct inode *ip)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t	*zp = ITOZ(ip);
 	int locked_here = 0;
 
 	/* Only read lock if we haven't already write locked, e.g. rollback */
 	if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) {
 		rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
 		locked_here = 1;
 	}
 
 	if (zp->z_sa_hdl != NULL) {
 		if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) {
 			dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 
 			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 			zfs_sa_upgrade_txholds(tx, zp);
 			if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 				/* The atime update is dropped on failure. */
 				dmu_tx_abort(tx);
 			} else {
 				uint64_t atime_enc[2];
 				inode_timespec_t ts = zpl_inode_get_atime(ip);
 
 				ZFS_TIME_ENCODE(&ts, atime_enc);
 				mutex_enter(&zp->z_lock);
 				(void) sa_update(zp->z_sa_hdl,
 				    SA_ZPL_ATIME(zfsvfs), (void *)&atime_enc,
 				    sizeof (atime_enc), tx);
 				zp->z_atime_dirty = B_FALSE;
 				mutex_exit(&zp->z_lock);
 				dmu_tx_commit(tx);
 			}
 		}
 
 		zfs_zinactive(zp);
 	}
 
 	if (locked_here)
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 }
 
 /*
  * Read the file data backing a page from the DMU into that page.  Any part
  * of the page past the end of the file is zero-filled.  On success the page
  * is marked up to date; on failure it is flagged with an error and ECKSUM
  * is reported to the caller as EIO.
  */
 static int
 zfs_fillpage(struct inode *ip, struct page *pp)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	loff_t file_size = i_size_read(ip);
 	u_offset_t start = page_offset(pp);
 	size_t nbytes = PAGE_SIZE;
 	void *kaddr;
 	int rc;
 
 	ASSERT3U(start, <, file_size);
 
 	/* Clamp the read so it does not run past the end of the file. */
 	if (start + nbytes > file_size)
 		nbytes = file_size - start;
 
 	kaddr = kmap(pp);
 	rc = dmu_read(zfsvfs->z_os, ITOZ(ip)->z_id, start, nbytes, kaddr,
 	    DMU_READ_PREFETCH);
 	if (nbytes != PAGE_SIZE)
 		memset((char *)kaddr + nbytes, 0, PAGE_SIZE - nbytes);
 	kunmap(pp);
 
 	if (rc == 0) {
 		ClearPageError(pp);
 		SetPageUptodate(pp);
 		return (0);
 	}
 
 	/* convert checksum errors into IO errors */
 	if (rc == ECKSUM)
 		rc = SET_ERROR(EIO);
 
 	SetPageError(pp);
 	ClearPageUptodate(pp);
 	return (rc);
 }
 
 /*
  * Fill a page with file data by calling zfs_fillpage(), and account a
  * PAGE_SIZE read in the dataset kstats when that succeeds.
  *
  *	IN:	ip	 - inode of file to get data from.
  *		pp	 - page to read
  *
  *	RETURN:	0 on success, error code on failure.
  */
 int
 zfs_getpage(struct inode *ip, struct page *pp)
 {
 	znode_t *zp = ITOZ(ip);
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	int rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 
 	if (rc != 0)
 		return (rc);
 
 	rc = zfs_fillpage(ip, pp);
 	if (rc == 0) {
 		dataset_kstats_update_read_kstats(&zfsvfs->z_kstat,
 		    PAGE_SIZE);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (rc);
 }
 
 /*
  * Check ZFS specific permissions to memory map a section of a file.
  *
  *	IN:	ip	- inode of the file to mmap
  *		off	- file offset
  *		addrp	- start address in memory region (unused)
  *		len	- length of memory region
  *		vm_flags- address flags
  *
  *	RETURN:	0 if success
  *		EPERM for a shared writable mapping of an immutable,
  *		read-only or append-only file
  *		EACCES for a readable or executable mapping of a
  *		quarantined file
  *		ENXIO when off is negative or off + len would exceed
  *		MAXOFFSET_T
  */
 int
 zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len,
     unsigned long vm_flags)
 {
 	(void) addrp;
 	zfsvfs_t *zfsvfs = ITOZSB(ip);
 	znode_t  *zp = ITOZ(ip);
 	int rc;
 
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	/* rc is 0 here; the first failing check below sets the error. */
 	if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) &&
 	    (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
 		rc = SET_ERROR(EPERM);
 	} else if ((vm_flags & (VM_READ | VM_EXEC)) &&
 	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
 		rc = SET_ERROR(EACCES);
 	} else if (off < 0 || len > MAXOFFSET_T - off) {
 		rc = SET_ERROR(ENXIO);
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (rc);
 }
 
 /*
  * Free or allocate space in a file.  Only the `F_FREESP' command is
  * supported; despite its name, that command can allocate space as well
  * as free it.
  *
  *	IN:	zp	- znode of file to free data in.
  *		cmd	- action to take (only F_FREESP supported).
  *		bfp	- section of file to free/alloc.
  *		flag	- current file open mode flags.
  *		offset	- current file offset (unused).
  *		cr	- credentials of caller.
  *
  *	RETURN:	0 on success, error code on failure.
  *
  * Timestamps:
  *	zp - ctime|mtime updated (presumably by zfs_freesp(); not visible
  *	     in this function)
  */
 int
 zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
     offset_t offset, cred_t *cr)
 {
 	(void) offset;
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	int		rc;
 
 	rc = zfs_enter_verify_zp(zfsvfs, zp, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	if (cmd != F_FREESP) {
 		rc = SET_ERROR(EINVAL);
 	} else if (zfs_is_readonly(zfsvfs)) {
 		/*
 		 * Callers might not be able to detect properly that we are
 		 * read-only, so check it explicitly here.
 		 */
 		rc = SET_ERROR(EROFS);
 	} else if (bfp->l_len < 0) {
 		rc = SET_ERROR(EINVAL);
 	} else {
 		/*
 		 * Permissions aren't checked on Solaris because on this OS
 		 * zfs_space() can only be called with an opened file handle.
 		 * On Linux we can get here through truncate_range() which
 		 * operates directly on inodes, so we need to check access
 		 * rights.
 		 */
 		rc = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
 		    zfs_init_idmap);
 		if (rc == 0) {
 			uint64_t start = bfp->l_start;
 			/* 0 means from start to end of file */
 			uint64_t length = bfp->l_len;
 
 			rc = zfs_freesp(zp, start, length, flag, TRUE);
 		}
 	}
 
 	zfs_exit(zfsvfs, FTAG);
 	return (rc);
 }
 
 /*
  * Build a short file identifier for an inode: the object number and the
  * low 32 bits of the generation number, each stored least significant
  * byte first.
  *
  *	IN:	ip	- inode to identify.
  *	IN/OUT:	fidp	- fid buffer; when fid_len is smaller than
  *			  SHORT_FID_LEN it is set to SHORT_FID_LEN and
  *			  ENOSPC is returned.
  *
  *	RETURN:	0 on success, error code on failure.
  */
 int
 zfs_fid(struct inode *ip, fid_t *fidp)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	uint64_t	obj_bits = zp->z_id;
 	uint64_t	gen64;
 	uint32_t	gen_bits;
 	zfid_short_t	*zfid;
 	int		idx, rc;
 
 	rc = zfs_enter(zfsvfs, FTAG);
 	if (rc != 0)
 		return (rc);
 
 	if (fidp->fid_len < SHORT_FID_LEN) {
 		fidp->fid_len = SHORT_FID_LEN;
 		rc = SET_ERROR(ENOSPC);
 		goto out;
 	}
 
 	rc = zfs_verify_zp(zp);
 	if (rc != 0)
 		goto out;
 
 	rc = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64,
 	    sizeof (uint64_t));
 	if (rc != 0)
 		goto out;
 
 	zfid = (zfid_short_t *)fidp;
 	zfid->zf_len = SHORT_FID_LEN;
 
 	/* Emit the object number one byte at a time, lowest byte first. */
 	for (idx = 0; idx < sizeof (zfid->zf_object); idx++, obj_bits >>= 8)
 		zfid->zf_object[idx] = (uint8_t)obj_bits;
 
 	/* Must have a non-zero generation number to distinguish from .zfs */
 	gen_bits = (uint32_t)gen64;
 	if (gen_bits == 0)
 		gen_bits = 1;
 	for (idx = 0; idx < sizeof (zfid->zf_gen); idx++, gen_bits >>= 8)
 		zfid->zf_gen[idx] = (uint8_t)gen_bits;
 
 out:
 	zfs_exit(zfsvfs, FTAG);
 	return (rc);
 }
 
 #if defined(_KERNEL)
 /* Kernel builds only: export the vnode operation entry points. */
 EXPORT_SYMBOL(zfs_open);
 EXPORT_SYMBOL(zfs_close);
 EXPORT_SYMBOL(zfs_lookup);
 EXPORT_SYMBOL(zfs_create);
 EXPORT_SYMBOL(zfs_tmpfile);
 EXPORT_SYMBOL(zfs_remove);
 EXPORT_SYMBOL(zfs_mkdir);
 EXPORT_SYMBOL(zfs_rmdir);
 EXPORT_SYMBOL(zfs_readdir);
 EXPORT_SYMBOL(zfs_getattr_fast);
 EXPORT_SYMBOL(zfs_setattr);
 EXPORT_SYMBOL(zfs_rename);
 EXPORT_SYMBOL(zfs_symlink);
 EXPORT_SYMBOL(zfs_readlink);
 EXPORT_SYMBOL(zfs_link);
 EXPORT_SYMBOL(zfs_inactive);
 EXPORT_SYMBOL(zfs_space);
 EXPORT_SYMBOL(zfs_fid);
 EXPORT_SYMBOL(zfs_getpage);
 EXPORT_SYMBOL(zfs_putpage);
 EXPORT_SYMBOL(zfs_dirty_inode);
 EXPORT_SYMBOL(zfs_map);
 
 /* Module parameter zfs_delete_blocks, registered with mode 0644. */
 /* CSTYLED */
 module_param(zfs_delete_blocks, ulong, 0644);
 MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async");
 #endif
diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
index 129e3606bb8d..928a222f1505 100644
--- a/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c
@@ -1,1902 +1,1945 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
  */
 
 #include <sys/dataset_kstats.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dir.h>
 #include <sys/zap.h>
 #include <sys/zfeature.h>
 #include <sys/zil_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/zio.h>
 #include <sys/zfs_rlock.h>
 #include <sys/spa_impl.h>
 #include <sys/zvol.h>
 #include <sys/zvol_impl.h>
 #include <cityhash.h>
 
 #include <linux/blkdev_compat.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/workqueue.h>
 
 #ifdef HAVE_BLK_MQ
 #include <linux/blk-mq.h>
 #endif
 
 /* Forward declaration: common handler for a bio or a blk-mq request. */
 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
     struct request *rq, boolean_t force_sync);
 
 /* Device major number; zvol_os_is_zvol() compares a device against it. */
 static unsigned int zvol_major = ZVOL_MAJOR;
 /*
  * When non-zero, zvol_request_impl() handles every request synchronously
  * instead of dispatching it to a taskq.
  */
 static unsigned int zvol_request_sync = 0;
 static unsigned int zvol_prefetch_bytes = (128 * 1024);
 static unsigned long zvol_max_discard_blocks = 16384;
 
 /*
  * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
  * to utilize more threads for small files but may affect prefetch hits.
  */
 #define	ZVOL_TASKQ_OFFSET_SHIFT 29
 
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 static unsigned int zvol_open_timeout_ms = 1000;
 #endif
 
 static unsigned int zvol_threads = 0;
 #ifdef HAVE_BLK_MQ
 static unsigned int zvol_blk_mq_threads = 0;
 static unsigned int zvol_blk_mq_actual_threads;
 static boolean_t zvol_use_blk_mq = B_FALSE;
 
 /*
  * The maximum number of volblocksize blocks to process per thread.  Typically,
  * write heavy workloads perform better with higher values here, and read
  * heavy workloads perform better with lower values, but that's not a hard
  * and fast rule.  It's basically a knob to tune between "less overhead with
  * less parallelism" and "more overhead, but more parallelism".
  *
  * '8' was chosen as a reasonable, balanced, default based off of sequential
  * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
  */
 static unsigned int zvol_blk_mq_blocks_per_thread = 8;
 #endif
 
 static unsigned int zvol_num_taskqs = 0;
 
 #ifndef	BLKDEV_DEFAULT_RQ
 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
 #define	BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
 #endif
 
 /*
  * Finalize our BIO or request.  With blk-mq, a non-NULL bio is completed
  * with BIO_END_IO(); otherwise the request 'rq' is ended with the error
  * converted by errno_to_bi_status().  Without blk-mq only the bio is used.
  */
 #ifdef	HAVE_BLK_MQ
 #define	END_IO(zv, bio, rq, error)  do { \
 	if (bio) { \
 		BIO_END_IO(bio, error); \
 	} else { \
 		blk_mq_end_request(rq, errno_to_bi_status(error)); \
 	} \
 } while (0)
 #else
 #define	END_IO(zv, bio, rq, error)	BIO_END_IO(bio, error)
 #endif
 
 #ifdef HAVE_BLK_MQ
 /* Requested queue depth; the tag set uses the 'actual' value below. */
 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
 static unsigned int zvol_actual_blk_mq_queue_depth;
 #endif
 
 /* Linux-specific per-zvol state, reached through zv->zv_zso. */
 struct zvol_state_os {
 	struct gendisk		*zvo_disk;	/* generic disk */
 	struct request_queue	*zvo_queue;	/* request queue */
 	dev_t			zvo_dev;	/* device id */
 
 #ifdef HAVE_BLK_MQ
 	/* blk-mq tag set, filled in by zvol_blk_mq_alloc_tag_set() */
 	struct blk_mq_tag_set tag_set;
 #endif
 
 	/* Set from the global 'zvol_use_blk_mq' at zvol load */
 	boolean_t use_blk_mq;
 };
 
 /*
  * Array of taskqs used for deferred request processing;
  * zvol_request_impl() selects tqs_taskq[hash % tqs_cnt].
  */
 typedef struct zv_taskq {
 	uint_t tqs_cnt;
 	taskq_t **tqs_taskq;
 } zv_taskq_t;
 static zv_taskq_t zvol_taskqs;
 static struct ida zvol_ida;
 
 /*
  * One I/O to process: the target zvol plus either a bio or a blk-mq
  * request (see zvol_request_impl(); both should not be set).
  */
 typedef struct zv_request_stack {
 	zvol_state_t	*zv;
 	struct bio	*bio;
 	struct request *rq;
 } zv_request_t;
 
 /* A request paired with a kernel work item. */
 typedef struct zv_work {
 	struct request  *rq;
 	struct work_struct work;
 } zv_work_t;
 
 /* Heap-allocated copy of a zv_request_t plus its taskq dispatch entry. */
 typedef struct zv_request_task {
 	zv_request_t zvr;
 	taskq_ent_t	ent;
 } zv_request_task_t;
 
 /*
  * Allocate a task wrapper holding a copy of 'zvr' and an initialized taskq
  * entry.  The allocation uses KM_SLEEP; release the result with
  * zv_request_task_free().
  */
 static zv_request_task_t *
 zv_request_task_create(zv_request_t zvr)
 {
 	zv_request_task_t *req_task;
 
 	req_task = kmem_alloc(sizeof (*req_task), KM_SLEEP);
 	taskq_init_ent(&req_task->ent);
 	req_task->zvr = zvr;
 
 	return (req_task);
 }
 
 /* Release a task wrapper obtained from zv_request_task_create(). */
 static void
 zv_request_task_free(zv_request_task_t *task)
 {
 	kmem_free(task, sizeof (zv_request_task_t));
 }
 
 #ifdef HAVE_BLK_MQ
 
 /*
  * blk-mq queue_rq handler, called when a new block multiqueue request
  * comes in.  A request contains one or more BIOs.  Passthrough requests
  * are failed with BLK_STS_IOERR; everything else is handed to
  * zvol_request_impl() with force_sync set to 0.
  */
 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
     const struct blk_mq_queue_data *bd)
 {
 	struct request *req = bd->rq;
 	zvol_state_t *zvol = req->q->queuedata;
 	blk_status_t status = BLK_STS_OK;
 
 	/* Tell the kernel that we are starting to process this request */
 	blk_mq_start_request(req);
 
 	if (!blk_rq_is_passthrough(req)) {
 		/* BLK_STS_OK acknowledges that we got this request */
 		zvol_request_impl(zvol, NULL, req, 0);
 	} else {
 		/* Skip non filesystem request */
 		status = BLK_STS_IOERR;
 		blk_mq_end_request(req, BLK_STS_IOERR);
 	}
 
 	return (status);
 }
 
 /* blk-mq operations table installed by zvol_blk_mq_alloc_tag_set(). */
 static struct blk_mq_ops zvol_blk_mq_queue_ops = {
 	.queue_rq = zvol_mq_queue_rq,
 };
 
 /*
  * Zero and fill in the blk-mq tag set embedded in the zvol's OS state,
  * then allocate it.  Returns the result of blk_mq_alloc_tag_set().
  */
 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
 {
 	struct blk_mq_tag_set *tags = &zv->zv_zso->tag_set;
 
 	memset(tags, 0, sizeof (*tags));
 
 	tags->ops = &zvol_blk_mq_queue_ops;
 	tags->driver_data = zv;
 	tags->numa_node = NUMA_NO_NODE;
 	tags->cmd_size = 0;
 	tags->nr_hw_queues = zvol_blk_mq_actual_threads;
 	tags->queue_depth = zvol_actual_blk_mq_queue_depth;
 
 	/*
 	 * BLK_MQ_F_BLOCKING is required because zvol_request_impl() makes
 	 * blocking calls.
 	 */
 	tags->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
 
 	return (blk_mq_alloc_tag_set(tags));
 }
 #endif /* HAVE_BLK_MQ */
 
 /*
  * Given a path, return B_TRUE if it resolves to a block device whose major
  * number is the zvol major, and B_FALSE otherwise (including when the
  * lookup of the path fails).
  */
 boolean_t
 zvol_os_is_zvol(const char *path)
 {
 	dev_t devno = 0;
 
 	if (vdev_lookup_bdev(path, &devno) != 0)
 		return (B_FALSE);
 
 	return ((MAJOR(devno) == zvol_major) ? B_TRUE : B_FALSE);
 }
 
 /*
  * Service a write (bio or blk-mq request) against a zvol.
  *
  * zvol_request_impl() takes zv_suspend_lock as reader before calling or
  * dispatching this function; the lock is released here before the I/O is
  * completed with END_IO().
  */
 static void
 zvol_write(zv_request_t *zvr)
 {
 	struct bio *bio = zvr->bio;
 	struct request *rq = zvr->rq;
 	int error = 0;
 	zfs_uio_t uio;
 	zvol_state_t *zv = zvr->zv;
 	struct request_queue *q;
 	struct gendisk *disk;
 	unsigned long start_time = 0;
 	boolean_t acct = B_FALSE;
 
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 	ASSERT3P(zv->zv_zilog, !=, NULL);
 
 	q = zv->zv_zso->zvo_queue;
 	disk = zv->zv_zso->zvo_disk;
 
 	/* bio marked as FLUSH need to flush before write */
 	if (io_is_flush(bio, rq))
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	/* Some requests are just for flush and nothing else. */
 	if (io_size(bio, rq) == 0) {
 		rw_exit(&zv->zv_suspend_lock);
 		END_IO(zv, bio, rq, 0);
 		return;
 	}
 
 	zfs_uio_bvec_init(&uio, bio, rq);
 
 	/* Remember the starting residual to compute bytes written later. */
 	ssize_t start_resid = uio.uio_resid;
 
 	/*
 	 * With use_blk_mq, accounting is done by blk_mq_start_request()
 	 * and blk_mq_end_request(), so we can skip it here.
 	 */
 	if (bio) {
 		acct = blk_queue_io_stat(q);
 		if (acct) {
 			start_time = blk_generic_start_io_acct(q, disk, WRITE,
 			    bio);
 		}
 	}
 
 	/* A FUA request or sync=always forces a ZIL commit after the write. */
 	boolean_t sync =
 	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 	    uio.uio_loffset, uio.uio_resid, RL_WRITER);
 
 	uint64_t volsize = zv->zv_volsize;
 	/* Write in chunks of at most DMU_MAX_ACCESS / 2, one tx per chunk. */
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 		uint64_t off = uio.uio_loffset;
 		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
 
 		if (bytes > volsize - off)	/* don't write past the end */
 			bytes = volsize - off;
 
 		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
 
 		/* This will only fail for ENOSPC */
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
 			break;
 		}
 		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
 		/* Only log the chunk if it was actually written. */
 		if (error == 0) {
 			zvol_log_write(zv, tx, off, bytes, sync);
 		}
 		dmu_tx_commit(tx);
 
 		if (error)
 			break;
 	}
 	zfs_rangelock_exit(lr);
 
 	/* Account whatever was consumed from the uio, even on error. */
 	int64_t nwritten = start_resid - uio.uio_resid;
 	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
 	task_io_account_write(nwritten);
 
 	if (sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 	rw_exit(&zv->zv_suspend_lock);
 
 	if (bio && acct) {
 		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
 	}
 
 	/* 'error' is a positive errno here; END_IO is given its negation. */
 	END_IO(zv, bio, rq, -error);
 }
 
 static void
 zvol_write_task(void *arg)
 {
 	zv_request_task_t *task = arg;
 	zvol_write(&task->zvr);
 	zv_request_task_free(task);
 }
 
 /*
  * Service a discard or secure erase (bio or blk-mq request) against a
  * zvol by freeing the affected range.
  *
  * zvol_request_impl() takes zv_suspend_lock as reader before calling or
  * dispatching this function; the lock is released here before the I/O is
  * completed with END_IO().
  */
 static void
 zvol_discard(zv_request_t *zvr)
 {
 	struct bio *bio = zvr->bio;
 	struct request *rq = zvr->rq;
 	zvol_state_t *zv = zvr->zv;
 	uint64_t start = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
 	uint64_t end = start + size;
 	boolean_t sync;
 	int error = 0;
 	dmu_tx_t *tx;
 	struct request_queue *q = zv->zv_zso->zvo_queue;
 	struct gendisk *disk = zv->zv_zso->zvo_disk;
 	unsigned long start_time = 0;
 	boolean_t acct = B_FALSE;
 
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 	ASSERT3P(zv->zv_zilog, !=, NULL);
 
 	/* I/O accounting is only done here for the bio path. */
 	if (bio) {
 		acct = blk_queue_io_stat(q);
 		if (acct) {
 			start_time = blk_generic_start_io_acct(q, disk, WRITE,
 			    bio);
 		}
 	}
 
 	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
 
 	/* A range extending past the end of the volume fails with EIO. */
 	if (end > zv->zv_volsize) {
 		error = SET_ERROR(EIO);
 		goto unlock;
 	}
 
 	/*
 	 * Align the request to volume block boundaries when a secure erase is
 	 * not required.  This will prevent dnode_free_range() from zeroing out
 	 * the unaligned parts which is slow (read-modify-write) and useless
 	 * since we are not freeing any space by doing so.
 	 */
 	if (!io_is_secure_erase(bio, rq)) {
 		start = P2ROUNDUP(start, zv->zv_volblocksize);
 		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
 		size = end - start;
 	}
 
 	/* Nothing left to free (e.g. after alignment): succeed. */
 	if (start >= end)
 		goto unlock;
 
 	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
 	    start, size, RL_WRITER);
 
 	tx = dmu_tx_create(zv->zv_objset);
 	dmu_tx_mark_netfree(tx);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0) {
 		dmu_tx_abort(tx);
 	} else {
 		/* Log the truncate in the tx, then free the range itself. */
 		zvol_log_truncate(zv, tx, start, size, B_TRUE);
 		dmu_tx_commit(tx);
 		error = dmu_free_long_range(zv->zv_objset,
 		    ZVOL_OBJ, start, size);
 	}
 	zfs_rangelock_exit(lr);
 
 	if (error == 0 && sync)
 		zil_commit(zv->zv_zilog, ZVOL_OBJ);
 
 unlock:
 	rw_exit(&zv->zv_suspend_lock);
 
 	if (bio && acct) {
 		blk_generic_end_io_acct(q, disk, WRITE, bio,
 		    start_time);
 	}
 
 	/* 'error' is a positive errno here; END_IO is given its negation. */
 	END_IO(zv, bio, rq, -error);
 }
 
 static void
 zvol_discard_task(void *arg)
 {
 	zv_request_task_t *task = arg;
 	zvol_discard(&task->zvr);
 	zv_request_task_free(task);
 }
 
 /*
  * Service a read (bio or blk-mq request) against a zvol.
  *
  * zvol_request_impl() takes zv_suspend_lock as reader before calling or
  * dispatching this function; the lock is released here before the I/O is
  * completed with END_IO().
  */
 static void
 zvol_read(zv_request_t *zvr)
 {
 	zvol_state_t *zv = zvr->zv;
 	struct bio *bio = zvr->bio;
 	struct request *rq = zvr->rq;
 	struct request_queue *queue;
 	struct gendisk *gd;
 	zfs_uio_t uio;
 	unsigned long acct_start = 0;
 	boolean_t do_acct = B_FALSE;
 	int error = 0;
 
 	ASSERT3P(zv, !=, NULL);
 	ASSERT3U(zv->zv_open_count, >, 0);
 
 	zfs_uio_bvec_init(&uio, bio, rq);
 
 	queue = zv->zv_zso->zvo_queue;
 	gd = zv->zv_zso->zvo_disk;
 
 	ssize_t resid_before = uio.uio_resid;
 
 	/*
 	 * When blk-mq is being used, accounting is done by
 	 * blk_mq_start_request() and blk_mq_end_request().
 	 */
 	if (bio) {
 		do_acct = blk_queue_io_stat(queue);
 		if (do_acct) {
 			acct_start = blk_generic_start_io_acct(queue, gd,
 			    READ, bio);
 		}
 	}
 
 	zfs_locked_range_t *range = zfs_rangelock_enter(&zv->zv_rangelock,
 	    uio.uio_loffset, uio.uio_resid, RL_READER);
 
 	uint64_t volsize = zv->zv_volsize;
 
 	/* Read in chunks of at most DMU_MAX_ACCESS / 2 bytes. */
 	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
 		uint64_t chunk = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
 		uint64_t remaining = volsize - uio.uio_loffset;
 
 		/* don't read past the end */
 		if (chunk > remaining)
 			chunk = remaining;
 
 		error = dmu_read_uio_dnode(zv->zv_dn, &uio, chunk);
 		if (error == 0)
 			continue;
 
 		/* convert checksum errors into IO errors */
 		if (error == ECKSUM)
 			error = SET_ERROR(EIO);
 		break;
 	}
 	zfs_rangelock_exit(range);
 
 	/* Account whatever was consumed from the uio, even on error. */
 	int64_t bytes_read = resid_before - uio.uio_resid;
 	dataset_kstats_update_read_kstats(&zv->zv_kstat, bytes_read);
 	task_io_account_read(bytes_read);
 
 	rw_exit(&zv->zv_suspend_lock);
 
 	if (bio && do_acct)
 		blk_generic_end_io_acct(queue, gd, READ, bio, acct_start);
 
 	END_IO(zv, bio, rq, -error);
 }
 
 static void
 zvol_read_task(void *arg)
 {
 	zv_request_task_t *task = arg;
 	zvol_read(&task->zvr);
 	zv_request_task_free(task);
 }
 
 
 /*
  * Process a BIO or request
  *
  * Either 'bio' or 'rq' should be set depending on if we are processing a
  * bio or a request (both should not be set).
  *
  * force_sync:	Set to 0 to defer processing to a background taskq
  *			Set to 1 to process data synchronously
  *
  * The zvol_request_sync module parameter, when non-zero, overrides
  * force_sync to 1.  Requests rejected here (out of range access, write to
  * a read-only zvol, zero-length read) are completed immediately with
  * END_IO().
  */
 static void
 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
     boolean_t force_sync)
 {
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	uint64_t offset = io_offset(bio, rq);
 	uint64_t size = io_size(bio, rq);
 	int rw = io_data_dir(bio, rq);
 
 	if (zvol_request_sync)
 		force_sync = 1;
 
 	zv_request_t zvr = {
 		.zv = zv,
 		.bio = bio,
 		.rq = rq,
 	};
 
 	/* Reject data-carrying I/O that extends past the end of the volume. */
 	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
 		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
 		    zv->zv_zso->zvo_disk->disk_name,
 		    (long long unsigned)offset,
 		    (long unsigned)size);
 
 		END_IO(zv, bio, rq, -SET_ERROR(EIO));
 		goto out;
 	}
 
 	zv_request_task_t *task;
 	zv_taskq_t *ztqs = &zvol_taskqs;
 	uint_t blk_mq_hw_queue = 0;
 	uint_t tq_idx;
 	uint_t taskq_hash;
 #ifdef HAVE_BLK_MQ
 	/* For blk-mq requests, look up the hardware queue number. */
 	if (rq)
 #ifdef HAVE_BLK_MQ_RQ_HCTX
 		blk_mq_hw_queue = rq->mq_hctx->queue_num;
 #else
 		blk_mq_hw_queue =
 		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
 #endif
 #endif
 	/*
 	 * Choose a taskq by hashing the zvol pointer, the offset shifted by
 	 * ZVOL_TASKQ_OFFSET_SHIFT, and the hardware queue number.
 	 */
 	taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
 	    blk_mq_hw_queue, 0);
 	tq_idx = taskq_hash % ztqs->tqs_cnt;
 
 	if (rw == WRITE) {
 		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
 			END_IO(zv, bio, rq, -SET_ERROR(EROFS));
 			goto out;
 		}
 
 		/*
 		 * Prevents the zvol from being suspended, or the ZIL being
 		 * concurrently opened.  Will be released after the i/o
 		 * completes.
 		 */
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		/*
 		 * Open a ZIL if this is the first time we have written to this
 		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
 		 * than zv_state_lock so that we don't need to acquire an
 		 * additional lock in this path.
 		 */
 		if (zv->zv_zilog == NULL) {
 			/*
 			 * The reader lock is dropped before taking the
 			 * lock as writer, so zv_zilog is checked again
 			 * once the writer lock is held.
 			 */
 			rw_exit(&zv->zv_suspend_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
 			if (zv->zv_zilog == NULL) {
 				zv->zv_zilog = zil_open(zv->zv_objset,
 				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 				zv->zv_flags |= ZVOL_WRITTEN_TO;
 				/* replay / destroy done in zvol_create_minor */
 				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
 				    ZIL_REPLAY_NEEDED));
 			}
 			rw_downgrade(&zv->zv_suspend_lock);
 		}
 
 		/*
 		 * We don't want this thread to be blocked waiting for i/o to
 		 * complete, so we instead wait from a taskq callback. The
 		 * i/o may be a ZIL write (via zil_commit()), or a read of an
 		 * indirect block, or a read of a data block (if this is a
 		 * partial-block write).  We will indicate that the i/o is
 		 * complete by calling END_IO() from the taskq callback.
 		 *
 		 * This design allows the calling thread to continue and
 		 * initiate more concurrent operations by calling
 		 * zvol_request() again. There are typically only a small
 		 * number of threads available to call zvol_request() (e.g.
 		 * one per iSCSI target), so keeping the latency of
 		 * zvol_request() low is important for performance.
 		 *
 		 * The zvol_request_sync module parameter allows this
 		 * behavior to be altered, for performance evaluation
 		 * purposes.  If the callback blocks, setting
 		 * zvol_request_sync=1 will result in much worse performance.
 		 *
 		 * We can have up to zvol_threads concurrent i/o's being
 		 * processed for all zvols on the system.  This is typically
 		 * a vast improvement over the zvol_request_sync=1 behavior
 		 * of one i/o at a time per zvol.  However, an even better
 		 * design would be for zvol_request() to initiate the zio
 		 * directly, and then be notified by the zio_done callback,
 		 * which would call END_IO().  Unfortunately, the DMU/ZIL
 		 * interfaces lack this functionality (they block waiting for
 		 * the i/o to complete).
 		 */
 		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
 			if (force_sync) {
 				zvol_discard(&zvr);
 			} else {
 				/* The task callback frees 'task' when done. */
 				task = zv_request_task_create(zvr);
 				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 				    zvol_discard_task, task, 0, &task->ent);
 			}
 		} else {
 			if (force_sync) {
 				zvol_write(&zvr);
 			} else {
 				task = zv_request_task_create(zvr);
 				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 				    zvol_write_task, task, 0, &task->ent);
 			}
 		}
 	} else {
 		/*
 		 * The SCST driver, and possibly others, may issue READ I/Os
 		 * with a length of zero bytes.  These empty I/Os contain no
 		 * data and require no additional handling.
 		 */
 		if (size == 0) {
 			END_IO(zv, bio, rq, 0);
 			goto out;
 		}
 
 		/* Released by zvol_read() once the read has been serviced. */
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		/* See comment in WRITE case above. */
 		if (force_sync) {
 			zvol_read(&zvr);
 		} else {
 			task = zv_request_task_create(zvr);
 			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
 			    zvol_read_task, task, 0, &task->ent);
 		}
 	}
 
 out:
 	/* Pairs with spl_fstrans_mark() at the top of the function. */
 	spl_fstrans_unmark(cookie);
 }
 
 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
 static void
 zvol_submit_bio(struct bio *bio)
 #else
 static blk_qc_t
 zvol_submit_bio(struct bio *bio)
 #endif
 #else
 static MAKE_REQUEST_FN_RET
 zvol_request(struct request_queue *q, struct bio *bio)
 #endif
 {
 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 #if defined(HAVE_BIO_BDEV_DISK)
 	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 #else
 	struct request_queue *q = bio->bi_disk->queue;
 #endif
 #endif
 	zvol_state_t *zv = q->queuedata;
 
 	zvol_request_impl(zv, bio, NULL, 0);
 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
 	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
 	return (BLK_QC_T_NONE);
 #endif
 }
 
 static int
 #ifdef HAVE_BLK_MODE_T
 zvol_open(struct gendisk *disk, blk_mode_t flag)
 #else
 zvol_open(struct block_device *bdev, fmode_t flag)
 #endif
 {
 	zvol_state_t *zv;
 	int error = 0;
 	boolean_t drop_suspend = B_FALSE;
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
 	hrtime_t start = gethrtime();
 
 retry:
 #endif
 	rw_enter(&zvol_state_lock, RW_READER);
 	/*
 	 * Obtain a copy of private_data under the zvol_state_lock to make
 	 * sure that either the result of zvol free code path setting
 	 * disk->private_data to NULL is observed, or zvol_os_free()
 	 * is not called on this zv because of the positive zv_open_count.
 	 */
 #ifdef HAVE_BLK_MODE_T
 	zv = disk->private_data;
 #else
 	zv = bdev->bd_disk->private_data;
 #endif
 	if (zv == NULL) {
 		rw_exit(&zvol_state_lock);
-		return (SET_ERROR(-ENXIO));
+		return (-SET_ERROR(ENXIO));
 	}
 
 	mutex_enter(&zv->zv_state_lock);
 	/*
 	 * Make sure zvol is not suspended during first open
 	 * (hold zv_suspend_lock) and respect proper lock acquisition
 	 * ordering - zv_suspend_lock before zv_state_lock
 	 */
 	if (zv->zv_open_count == 0) {
 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_READER);
 			mutex_enter(&zv->zv_state_lock);
 			/* check to see if zv_suspend_lock is needed */
 			if (zv->zv_open_count != 0) {
 				rw_exit(&zv->zv_suspend_lock);
 			} else {
 				drop_suspend = B_TRUE;
 			}
 		} else {
 			drop_suspend = B_TRUE;
 		}
 	}
 	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	if (zv->zv_open_count == 0) {
 		boolean_t drop_namespace = B_FALSE;
 
 		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 
 		/*
 		 * In all other call paths the spa_namespace_lock is taken
 		 * before the bdev->bd_mutex lock.  However, on open(2)
 		 * the __blkdev_get() function calls fops->open() with the
 		 * bdev->bd_mutex lock held.  This can result in a deadlock
 		 * when zvols from one pool are used as vdevs in another.
 		 *
 		 * To prevent a lock inversion deadlock we preemptively
 		 * take the spa_namespace_lock.  Normally the lock will not
 		 * be contended and this is safe because spa_open_common()
 		 * handles the case where the caller already holds the
 		 * spa_namespace_lock.
 		 *
 		 * When the lock cannot be aquired after multiple retries
 		 * this must be the vdev on zvol deadlock case and we have
 		 * no choice but to return an error.  For 5.12 and older
 		 * kernels returning -ERESTARTSYS will result in the
 		 * bdev->bd_mutex being dropped, then reacquired, and
 		 * fops->open() being called again.  This process can be
 		 * repeated safely until both locks are acquired.  For 5.13
 		 * and newer the -ERESTARTSYS retry logic was removed from
 		 * the kernel so the only option is to return the error for
 		 * the caller to handle it.
 		 */
 		if (!mutex_owned(&spa_namespace_lock)) {
 			if (!mutex_tryenter(&spa_namespace_lock)) {
 				mutex_exit(&zv->zv_state_lock);
 				rw_exit(&zv->zv_suspend_lock);
 
 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
 				schedule();
-				return (SET_ERROR(-ERESTARTSYS));
+				return (-SET_ERROR(ERESTARTSYS));
 #else
 				if ((gethrtime() - start) > timeout)
-					return (SET_ERROR(-ERESTARTSYS));
+					return (-SET_ERROR(ERESTARTSYS));
 
 				schedule_timeout(MSEC_TO_TICK(10));
 				goto retry;
 #endif
 			} else {
 				drop_namespace = B_TRUE;
 			}
 		}
 
 		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));
 
 		if (drop_namespace)
 			mutex_exit(&spa_namespace_lock);
 	}
 
 	if (error == 0) {
 		if ((blk_mode_is_open_write(flag)) &&
 		    (zv->zv_flags & ZVOL_RDONLY)) {
 			if (zv->zv_open_count == 0)
 				zvol_last_close(zv);
 
-			error = SET_ERROR(-EROFS);
+			error = -SET_ERROR(EROFS);
 		} else {
 			zv->zv_open_count++;
 		}
 	}
 
 	mutex_exit(&zv->zv_state_lock);
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 
 	if (error == 0)
 #ifdef HAVE_BLK_MODE_T
 		disk_check_media_change(disk);
 #else
 		zfs_check_media_change(bdev);
 #endif
 
 	return (error);
 }
 
 static void
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
 zvol_release(struct gendisk *disk)
 #else
 zvol_release(struct gendisk *disk, fmode_t unused)
 #endif
 {
 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
 	(void) unused;
 #endif
 	zvol_state_t *zv;
 	boolean_t drop_suspend = B_TRUE;
 
 	rw_enter(&zvol_state_lock, RW_READER);
 	zv = disk->private_data;
 
 	mutex_enter(&zv->zv_state_lock);
 	ASSERT3U(zv->zv_open_count, >, 0);
 	/*
 	 * make sure zvol is not suspended during last close
 	 * (hold zv_suspend_lock) and respect proper lock acquisition
 	 * ordering - zv_suspend_lock before zv_state_lock
 	 */
 	if (zv->zv_open_count == 1) {
 		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
 			mutex_exit(&zv->zv_state_lock);
 			rw_enter(&zv->zv_suspend_lock, RW_READER);
 			mutex_enter(&zv->zv_state_lock);
 			/* check to see if zv_suspend_lock is needed */
 			if (zv->zv_open_count != 1) {
 				rw_exit(&zv->zv_suspend_lock);
 				drop_suspend = B_FALSE;
 			}
 		}
 	} else {
 		drop_suspend = B_FALSE;
 	}
 	rw_exit(&zvol_state_lock);
 
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	zv->zv_open_count--;
 	if (zv->zv_open_count == 0) {
 		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
 		zvol_last_close(zv);
 	}
 
 	mutex_exit(&zv->zv_state_lock);
 
 	if (drop_suspend)
 		rw_exit(&zv->zv_suspend_lock);
 }
 
 static int
 zvol_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned int cmd, unsigned long arg)
 {
 	zvol_state_t *zv = bdev->bd_disk->private_data;
 	int error = 0;
 
 	ASSERT3U(zv->zv_open_count, >, 0);
 
 	switch (cmd) {
 	case BLKFLSBUF:
 #ifdef HAVE_FSYNC_BDEV
 		fsync_bdev(bdev);
 #elif defined(HAVE_SYNC_BLOCKDEV)
 		sync_blockdev(bdev);
 #else
 #error "Neither fsync_bdev() nor sync_blockdev() found"
 #endif
 		invalidate_bdev(bdev);
 		rw_enter(&zv->zv_suspend_lock, RW_READER);
 
 		if (!(zv->zv_flags & ZVOL_RDONLY))
 			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
 
 		rw_exit(&zv->zv_suspend_lock);
 		break;
 
 	case BLKZNAME:
 		mutex_enter(&zv->zv_state_lock);
 		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
 		mutex_exit(&zv->zv_state_lock);
 		break;
 
 	default:
 		error = -ENOTTY;
 		break;
 	}
 
 	return (SET_ERROR(error));
 }
 
 #ifdef CONFIG_COMPAT
 static int
 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
     unsigned cmd, unsigned long arg)
 {
 	return (zvol_ioctl(bdev, mode, cmd, arg));
 }
 #else
 #define	zvol_compat_ioctl	NULL
 #endif
 
 static unsigned int
 zvol_check_events(struct gendisk *disk, unsigned int clearing)
 {
 	unsigned int mask = 0;
 
 	rw_enter(&zvol_state_lock, RW_READER);
 
 	zvol_state_t *zv = disk->private_data;
 	if (zv != NULL) {
 		mutex_enter(&zv->zv_state_lock);
 		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
 		zv->zv_changed = 0;
 		mutex_exit(&zv->zv_state_lock);
 	}
 
 	rw_exit(&zvol_state_lock);
 
 	return (mask);
 }
 
 static int
 zvol_revalidate_disk(struct gendisk *disk)
 {
 	rw_enter(&zvol_state_lock, RW_READER);
 
 	zvol_state_t *zv = disk->private_data;
 	if (zv != NULL) {
 		mutex_enter(&zv->zv_state_lock);
 		set_capacity(zv->zv_zso->zvo_disk,
 		    zv->zv_volsize >> SECTOR_BITS);
 		mutex_exit(&zv->zv_state_lock);
 	}
 
 	rw_exit(&zvol_state_lock);
 
 	return (0);
 }
 
 int
 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
 {
 	struct gendisk *disk = zv->zv_zso->zvo_disk;
 
 #if defined(HAVE_REVALIDATE_DISK_SIZE)
 	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
 #elif defined(HAVE_REVALIDATE_DISK)
 	revalidate_disk(disk);
 #else
 	zvol_revalidate_disk(disk);
 #endif
 	return (0);
 }
 
 void
 zvol_os_clear_private(zvol_state_t *zv)
 {
 	/*
 	 * Cleared while holding zvol_state_lock as a writer
 	 * which will prevent zvol_open() from opening it.
 	 */
 	zv->zv_zso->zvo_disk->private_data = NULL;
 }
 
 /*
  * Provide a simple virtual geometry for legacy compatibility.  For devices
  * smaller than 1 MiB a small head and sector count is used to allow very
  * tiny devices.  For devices over 1 Mib a standard head and sector count
  * is used to keep the cylinders count reasonable.
  */
 static int
 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 {
 	zvol_state_t *zv = bdev->bd_disk->private_data;
 	sector_t sectors;
 
 	ASSERT3U(zv->zv_open_count, >, 0);
 
 	sectors = get_capacity(zv->zv_zso->zvo_disk);
 
 	if (sectors > 2048) {
 		geo->heads = 16;
 		geo->sectors = 63;
 	} else {
 		geo->heads = 2;
 		geo->sectors = 4;
 	}
 
 	geo->start = 0;
 	geo->cylinders = sectors / (geo->heads * geo->sectors);
 
 	return (0);
 }
 
 /*
  * Why have two separate block_device_operations structs?
  *
  * Normally we'd just have one, and assign 'submit_bio' as needed.  However,
  * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
  * can't just change submit_bio dynamically at runtime.  So just create two
  * separate structs to get around this.
  */
 static const struct block_device_operations zvol_ops_blk_mq = {
 	.open			= zvol_open,
 	.release		= zvol_release,
 	.ioctl			= zvol_ioctl,
 	.compat_ioctl		= zvol_compat_ioctl,
 	.check_events		= zvol_check_events,
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
 	.revalidate_disk	= zvol_revalidate_disk,
 #endif
 	.getgeo			= zvol_getgeo,
 	.owner			= THIS_MODULE,
 };
 
 static const struct block_device_operations zvol_ops = {
 	.open			= zvol_open,
 	.release		= zvol_release,
 	.ioctl			= zvol_ioctl,
 	.compat_ioctl		= zvol_compat_ioctl,
 	.check_events		= zvol_check_events,
 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
 	.revalidate_disk	= zvol_revalidate_disk,
 #endif
 	.getgeo			= zvol_getgeo,
 	.owner			= THIS_MODULE,
 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
 	.submit_bio		= zvol_submit_bio,
 #endif
 };
 
+/*
+ * Since 6.9, Linux has been removing queue limit setters in favour of an
+ * initial queue_limits struct applied when the device is open. Since 6.11,
+ * queue_limits is being extended to allow more things to be applied when the
+ * device is open. Setters are also being removed for this.
+ *
+ * For OpenZFS, this means that depending on kernel version, some options may
+ * be set up before the device is open, and some applied to an open device
+ * (queue) after the fact.
+ *
+ * We manage this complexity by having our own limits struct,
+ * zvol_queue_limits_t, in which we carry any queue config that we're
+ * interested in setting. This structure is the same on all kernels.
+ *
+ * These limits are then applied to the queue at device open time by the most
+ * appropriate method for the kernel.
+ *
+ * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
+ * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
+ * struct queue_limits, and passes it in. Any fields added in later kernels are
+ * (obviously) not set up here.
+ *
+ * zvol_queue_limits_apply() is called on all kernel versions after the queue
+ * is created, and applies any remaining config. Before 6.9 that will be
+ * everything, via setter methods. After 6.9 that will be whatever couldn't be
+ * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
+ * will always be a no-op on the latest kernel we support).
+ */
 typedef struct zvol_queue_limits {
 	unsigned int	zql_max_hw_sectors;
 	unsigned short	zql_max_segments;
 	unsigned int	zql_max_segment_size;
 	unsigned int	zql_io_opt;
+	unsigned int	zql_physical_block_size;
+	unsigned int	zql_max_discard_sectors;
+	unsigned int	zql_discard_granularity;
 } zvol_queue_limits_t;
 
 static void
 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
     boolean_t use_blk_mq)
 {
 	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;
 
 	if (use_blk_mq) {
 		/*
 		 * IO requests can be really big (1MB).  When an IO request
 		 * comes in, it is passed off to zvol_read() or zvol_write()
 		 * in a new thread, where it is chunked up into 'volblocksize'
 		 * sized pieces and processed.  So for example, if the request
 		 * is a 1MB write and your volblocksize is 128k, one zvol_write
 		 * thread will take that request and sequentially do ten 128k
 		 * IOs.  This is due to the fact that the thread needs to lock
 		 * each volblocksize sized block.  So you might be wondering:
 		 * "instead of passing the whole 1MB request to one thread,
 		 * why not pass ten individual 128k chunks to ten threads and
 		 * process the whole write in parallel?"  The short answer is
 		 * that there's a sweet spot number of chunks that balances
 		 * the greater parallelism with the added overhead of more
 		 * threads. The sweet spot can be different depending on if you
 		 * have a read or write  heavy workload.  Writes typically want
 		 * high chunk counts while reads typically want lower ones.  On
 		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
 		 * configuration, with volblocksize=8k, the sweet spot for good
 		 * sequential reads and writes was at 8 chunks.
 		 */
 
 		/*
 		 * Below we tell the kernel how big we want our requests
 		 * to be.  You would think that blk_queue_io_opt() would be
 		 * used to do this since it is used to "set optimal request
 		 * size for the queue", but that doesn't seem to do
 		 * anything - the kernel still gives you huge requests
 		 * with tons of little PAGE_SIZE segments contained within it.
 		 *
 		 * Knowing that the kernel will just give you PAGE_SIZE segments
 		 * no matter what, you can say "ok, I want PAGE_SIZE byte
 		 * segments, and I want 'N' of them per request", where N is
 		 * the correct number of segments for the volblocksize and
 		 * number of chunks you want.
 		 */
 #ifdef HAVE_BLK_MQ
 		if (zvol_blk_mq_blocks_per_thread != 0) {
 			unsigned int chunks;
 			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);
 
 			limits->zql_max_segment_size = PAGE_SIZE;
 			limits->zql_max_segments =
 			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
 		} else {
 			/*
 			 * Special case: zvol_blk_mq_blocks_per_thread = 0
 			 * Max everything out.
 			 */
 			limits->zql_max_segments = UINT16_MAX;
 			limits->zql_max_segment_size = UINT_MAX;
 		}
 	} else {
 #endif
 		limits->zql_max_segments = UINT16_MAX;
 		limits->zql_max_segment_size = UINT_MAX;
 	}
 
 	limits->zql_io_opt = zv->zv_volblocksize;
+
+	limits->zql_physical_block_size = zv->zv_volblocksize;
+	limits->zql_max_discard_sectors =
+	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
+	limits->zql_discard_granularity = zv->zv_volblocksize;
 }
 
 #ifdef HAVE_BLK_ALLOC_DISK_2ARG
 static void
 zvol_queue_limits_convert(zvol_queue_limits_t *limits,
     struct queue_limits *qlimits)
 {
 	memset(qlimits, 0, sizeof (struct queue_limits));
 	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
 	qlimits->max_segments = limits->zql_max_segments;
 	qlimits->max_segment_size = limits->zql_max_segment_size;
 	qlimits->io_opt = limits->zql_io_opt;
+	qlimits->physical_block_size = limits->zql_physical_block_size;
+	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
+	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
+	qlimits->discard_granularity = limits->zql_discard_granularity;
+#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
+	qlimits->features =
+	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
+#endif
 }
-#else
+#endif
+
 static void
 zvol_queue_limits_apply(zvol_queue_limits_t *limits,
     struct request_queue *queue)
 {
+#ifndef HAVE_BLK_ALLOC_DISK_2ARG
 	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
 	blk_queue_max_segments(queue, limits->zql_max_segments);
 	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
 	blk_queue_io_opt(queue, limits->zql_io_opt);
-}
+	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
+	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
+	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
+#endif
+#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
+	blk_queue_set_write_cache(queue, B_TRUE);
+	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
 #endif
+}
 
 static int
 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
 {
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
 #if defined(HAVE_BLK_ALLOC_DISK)
 	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
 	if (zso->zvo_disk == NULL)
 		return (1);
 
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
-	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
 	struct queue_limits qlimits;
 	zvol_queue_limits_convert(limits, &qlimits);
 	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
 	if (IS_ERR(disk)) {
 		zso->zvo_disk = NULL;
 		return (1);
 	}
 
 	zso->zvo_disk = disk;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 	zso->zvo_queue = zso->zvo_disk->queue;
+
 #else
 	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)
 		return (1);
 
 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 	if (zso->zvo_disk == NULL) {
 		blk_cleanup_queue(zso->zvo_queue);
 		return (1);
 	}
 
 	zso->zvo_disk->queue = zso->zvo_queue;
-	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #endif /* HAVE_BLK_ALLOC_DISK */
 #else
 	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
 	if (zso->zvo_queue == NULL)
 		return (1);
 
 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 	if (zso->zvo_disk == NULL) {
 		blk_cleanup_queue(zso->zvo_queue);
 		return (1);
 	}
 
 	zso->zvo_disk->queue = zso->zvo_queue;
-	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
+
+	zvol_queue_limits_apply(limits, zso->zvo_queue);
+
 	return (0);
 
 }
 
 static int
 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
 {
 #ifdef HAVE_BLK_MQ
 	struct zvol_state_os *zso = zv->zv_zso;
 
 	/* Allocate our blk-mq tag_set */
 	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
 		return (1);
 
 #if defined(HAVE_BLK_ALLOC_DISK)
 	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
 	if (zso->zvo_disk == NULL) {
 		blk_mq_free_tag_set(&zso->tag_set);
 		return (1);
 	}
 	zso->zvo_queue = zso->zvo_disk->queue;
-	zvol_queue_limits_apply(limits, zso->zvo_queue);
 	zso->zvo_disk->minors = ZVOL_MINORS;
 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
 	struct queue_limits qlimits;
 	zvol_queue_limits_convert(limits, &qlimits);
 	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
 	if (IS_ERR(disk)) {
 		zso->zvo_disk = NULL;
 		blk_mq_free_tag_set(&zso->tag_set);
 		return (1);
 	}
 
 	zso->zvo_disk = disk;
 	zso->zvo_queue = zso->zvo_disk->queue;
 	zso->zvo_disk->minors = ZVOL_MINORS;
 #else
 	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
 	if (zso->zvo_disk == NULL) {
 		blk_cleanup_queue(zso->zvo_queue);
 		blk_mq_free_tag_set(&zso->tag_set);
 		return (1);
 	}
 	/* Allocate queue */
 	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
 	if (IS_ERR(zso->zvo_queue)) {
 		blk_mq_free_tag_set(&zso->tag_set);
 		return (1);
 	}
 
 	/* Our queue is now created, assign it to our disk */
 	zso->zvo_disk->queue = zso->zvo_queue;
-	zvol_queue_limits_apply(limits, zso->zvo_queue);
-
 #endif
+
+	zvol_queue_limits_apply(limits, zso->zvo_queue);
 #endif
+
 	return (0);
 }
 
 /*
  * Allocate memory for a new zvol_state_t and setup the required
  * request queue and generic disk structures for the block device.
  */
 static zvol_state_t *
-zvol_alloc(dev_t dev, const char *name)
+zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
 {
 	zvol_state_t *zv;
 	struct zvol_state_os *zso;
 	uint64_t volmode;
 	int ret;
 
 	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
 		return (NULL);
 
 	if (volmode == ZFS_VOLMODE_DEFAULT)
 		volmode = zvol_volmode;
 
 	if (volmode == ZFS_VOLMODE_NONE)
 		return (NULL);
 
 	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
 	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
 	zv->zv_zso = zso;
 	zv->zv_volmode = volmode;
+	zv->zv_volblocksize = volblocksize;
 
 	list_link_init(&zv->zv_next);
 	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
 
 #ifdef HAVE_BLK_MQ
 	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
 #endif
 
 	zvol_queue_limits_t limits;
 	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);
 
 	/*
 	 * The block layer has 3 interfaces for getting BIOs:
 	 *
 	 * 1. blk-mq request queues (new)
 	 * 2. submit_bio() (oldest)
 	 * 3. regular request queues (old).
 	 *
 	 * Each of those interfaces has two permutations:
 	 *
 	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
 	 *    both the disk and its queue (5.14 kernel or newer)
 	 *
 	 * b) We don't have blk_*alloc_disk(), and have to allocate the
 	 *    disk and the queue separately. (5.13 kernel or older)
 	 */
 	if (zv->zv_zso->use_blk_mq) {
 		ret = zvol_alloc_blk_mq(zv, &limits);
 		zso->zvo_disk->fops = &zvol_ops_blk_mq;
 	} else {
 		ret = zvol_alloc_non_blk_mq(zso, &limits);
 		zso->zvo_disk->fops = &zvol_ops;
 	}
 	if (ret != 0)
 		goto out_kmem;
 
-	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);
-
 	/* Limit read-ahead to a single page to prevent over-prefetching. */
 	blk_queue_set_read_ahead(zso->zvo_queue, 1);
 
 	if (!zv->zv_zso->use_blk_mq) {
 		/* Disable write merging in favor of the ZIO pipeline. */
 		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
 	}
 
-	/* Enable /proc/diskstats */
-	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);
-
 	zso->zvo_queue->queuedata = zv;
 	zso->zvo_dev = dev;
 	zv->zv_open_count = 0;
 	strlcpy(zv->zv_name, name, MAXNAMELEN);
 
 	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
 	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
 
 	zso->zvo_disk->major = zvol_major;
 	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;
 
 	/*
 	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
 	 * This is accomplished by limiting the number of minors for the
 	 * device to one and explicitly disabling partition scanning.
 	 */
 	if (volmode == ZFS_VOLMODE_DEV) {
 		zso->zvo_disk->minors = 1;
 		zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT;
 		zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART;
 	}
 
 	zso->zvo_disk->first_minor = (dev & MINORMASK);
 	zso->zvo_disk->private_data = zv;
 	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
 	    ZVOL_DEV_NAME, (dev & MINORMASK));
 
 	return (zv);
 
 out_kmem:
 	kmem_free(zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
 	return (NULL);
 }
 
 /*
  * Cleanup then free a zvol_state_t which was created by zvol_alloc().
  * At this time, the structure is not opened by anyone, is taken off
  * the zvol_state_list, and has its private data set to NULL.
  * The zvol_state_lock is dropped.
  *
  * This function may take many milliseconds to complete (e.g. we've seen
  * it take over 256ms), due to the calls to "blk_cleanup_queue" and
  * "del_gendisk". Thus, consumers need to be careful to account for this
  * latency when calling this function.
  */
 void
 zvol_os_free(zvol_state_t *zv)
 {
 
 	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
 	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
 	ASSERT0(zv->zv_open_count);
 	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);
 
 	rw_destroy(&zv->zv_suspend_lock);
 	zfs_rangelock_fini(&zv->zv_rangelock);
 
 	del_gendisk(zv->zv_zso->zvo_disk);
 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
 	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
 #if defined(HAVE_BLK_CLEANUP_DISK)
 	blk_cleanup_disk(zv->zv_zso->zvo_disk);
 #else
 	put_disk(zv->zv_zso->zvo_disk);
 #endif
 #else
 	blk_cleanup_queue(zv->zv_zso->zvo_queue);
 	put_disk(zv->zv_zso->zvo_disk);
 #endif
 
 #ifdef HAVE_BLK_MQ
 	if (zv->zv_zso->use_blk_mq)
 		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
 #endif
 
 	ida_simple_remove(&zvol_ida,
 	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);
 
 	mutex_destroy(&zv->zv_state_lock);
 	dataset_kstats_destroy(&zv->zv_kstat);
 
 	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
 	kmem_free(zv, sizeof (zvol_state_t));
 }
 
 void
 zvol_wait_close(zvol_state_t *zv)
 {
 }
 
 struct add_disk_work {
 	struct delayed_work work;
 	struct gendisk *disk;
 	int error;
 };
 
 static int
 __zvol_os_add_disk(struct gendisk *disk)
 {
 	int error = 0;
 #ifdef HAVE_ADD_DISK_RET
 	error = add_disk(disk);
 #else
 	add_disk(disk);
 #endif
 	return (error);
 }
 
 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
 static void
 zvol_os_add_disk_work(struct work_struct *work)
 {
 	struct add_disk_work *add_disk_work;
 	add_disk_work = container_of(work, struct add_disk_work, work.work);
 	add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
 }
 #endif
 
 /*
  * SPECIAL CASE:
  *
  * This function basically calls add_disk() from a workqueue.   You may be
  * thinking: why not just call add_disk() directly?
  *
  * When you call add_disk(), the zvol appears to the world.  When this happens,
  * the kernel calls disk_scan_partitions() on the zvol, which behaves
  * differently on the 6.9+ kernels:
  *
  * - 6.8 and older kernels -
  * disk_scan_partitions()
  *	handle = bdev_open_by_dev(
  *		zvol_open()
  *	bdev_release(handle);
  *		zvol_release()
  *
  *
  * - 6.9+ kernels -
  * disk_scan_partitions()
  * 	file = bdev_file_open_by_dev()
  *		zvol_open()
  *	fput(file)
  *	< wait for return to userspace >
  *		zvol_release()
  *
  * The difference is that the bdev_release() from the 6.8 kernel is synchronous
  * while the fput() from the 6.9 kernel is async.  Or more specifically it's
  * async that has to wait until we return to userspace (since it adds the fput
  * into the caller's work queue with the TWA_RESUME flag set).  This is not the
  * behavior we want, since we want do things like create+destroy a zvol within
  * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the
  * reference to the zvol while we're in the IOCTL, which can't wait until we
  * return to userspace.
  *
  * We can get around this since fput() has a special codepath for when it's
  * running in a kernel thread or interrupt.  In those cases, it just puts the
  * fput into the system workqueue, which we can force to run with
  * __flush_workqueue().  That is why we call add_disk() from a workqueue - so it
  * run from a kernel thread and "tricks" the fput() codepaths.
  *
  * Note that __flush_workqueue() is slowly getting deprecated.  This may be ok
  * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
  * fput) to happen, which it eventually, naturally, will from the system_wq
  * without us explicitly calling __flush_workqueue().
  */
 static int
 zvol_os_add_disk(struct gendisk *disk)
 {
 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
 	struct add_disk_work add_disk_work;
 
 	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
 	add_disk_work.disk = disk;
 	add_disk_work.error = 0;
 
 	/* Use *_delayed_work functions since they're not GPL'd */
 	schedule_delayed_work(&add_disk_work.work, 0);
 	flush_delayed_work(&add_disk_work.work);
 
 	__flush_workqueue(system_wq);
 	return (add_disk_work.error);
 #else	/* <= 6.8 kernel */
 	return (__zvol_os_add_disk(disk));
 #endif
 }
 
 /*
  * Create a block device minor node and setup the linkage between it
  * and the specified volume.  Once this function returns the block
  * device is live and ready for use.
  */
 int
 zvol_os_create_minor(const char *name)
 {
 	zvol_state_t *zv;
 	objset_t *os;
 	dmu_object_info_t *doi;
 	uint64_t volsize;
 	uint64_t len;
 	unsigned minor = 0;
 	int error = 0;
 	int idx;
 	uint64_t hash = zvol_name_hash(name);
 	bool replayed_zil = B_FALSE;
 
 	if (zvol_inhibit_dev)
 		return (0);
 
 	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
 	if (idx < 0)
 		return (SET_ERROR(-idx));
 	minor = idx << ZVOL_MINOR_BITS;
 	if (MINOR(minor) != minor) {
 		/* too many partitions can cause an overflow */
 		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
 		    name, minor, MINOR(minor));
 		ida_simple_remove(&zvol_ida, idx);
 		return (SET_ERROR(EINVAL));
 	}
 
 	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
 	if (zv) {
 		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 		mutex_exit(&zv->zv_state_lock);
 		ida_simple_remove(&zvol_ida, idx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
 
 	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
 	if (error)
 		goto out_doi;
 
 	error = dmu_object_info(os, ZVOL_OBJ, doi);
 	if (error)
 		goto out_dmu_objset_disown;
 
 	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
 	if (error)
 		goto out_dmu_objset_disown;
 
-	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
+	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
+	    doi->doi_data_block_size);
 	if (zv == NULL) {
 		error = SET_ERROR(EAGAIN);
 		goto out_dmu_objset_disown;
 	}
 	zv->zv_hash = hash;
 
 	if (dmu_objset_is_snapshot(os))
 		zv->zv_flags |= ZVOL_RDONLY;
 
-	zv->zv_volblocksize = doi->doi_data_block_size;
 	zv->zv_volsize = volsize;
 	zv->zv_objset = os;
 
 	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);
 
-
-
-	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
-	    zv->zv_volblocksize);
-	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
-	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
-	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
-	    zv->zv_volblocksize);
 #ifdef QUEUE_FLAG_DISCARD
 	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
 #endif
 #ifdef QUEUE_FLAG_NONROT
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
 #endif
 #ifdef QUEUE_FLAG_ADD_RANDOM
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
 #endif
 	/* This flag was introduced in kernel version 4.12. */
 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
 	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
 #endif
 
 	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
 	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
 	if (error)
 		goto out_dmu_objset_disown;
 	ASSERT3P(zv->zv_zilog, ==, NULL);
 	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
 	if (spa_writeable(dmu_objset_spa(os))) {
 		if (zil_replay_disable)
 			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
 		else
 			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
 	}
 	if (replayed_zil)
 		zil_close(zv->zv_zilog);
 	zv->zv_zilog = NULL;
 
 	/*
 	 * When udev detects the addition of the device it will immediately
 	 * invoke blkid(8) to determine the type of content on the device.
 	 * Prefetching the blocks commonly scanned by blkid(8) will speed
 	 * up this process.
 	 */
 	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
 	if (len > 0) {
 		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
 		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
 		    ZIO_PRIORITY_SYNC_READ);
 	}
 
 	zv->zv_objset = NULL;
 out_dmu_objset_disown:
 	dmu_objset_disown(os, B_TRUE, FTAG);
 out_doi:
 	kmem_free(doi, sizeof (dmu_object_info_t));
 
 	/*
 	 * Keep in mind that once add_disk() is called, the zvol is
 	 * announced to the world, and zvol_open()/zvol_release() can
 	 * be called at any time. Incidentally, add_disk() itself calls
 	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
 	 * directly as well.
 	 */
 	if (error == 0) {
 		rw_enter(&zvol_state_lock, RW_WRITER);
 		zvol_insert(zv);
 		rw_exit(&zvol_state_lock);
 		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
 	} else {
 		ida_simple_remove(&zvol_ida, idx);
 	}
 
 	return (error);
 }
 
 void
 zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
 {
 	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);
 
 	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
 	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 
 	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
 
 	/* move to new hashtable entry  */
 	zv->zv_hash = zvol_name_hash(newname);
 	hlist_del(&zv->zv_hlink);
 	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
 
 	/*
 	 * The block device's read-only state is briefly changed causing
 	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
 	 * the name change and fixes the symlinks.  This does not change
 	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
 	 * changes.  This would normally be done using kobject_uevent() but
 	 * that is a GPL-only symbol which is why we need this workaround.
 	 */
 	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
 	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
 
 	dataset_kstats_rename(&zv->zv_kstat, newname);
 }
 
 void
 zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
 {
 
 	set_disk_ro(zv->zv_zso->zvo_disk, flags);
 }
 
 void
 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
 {
 
 	set_capacity(zv->zv_zso->zvo_disk, capacity);
 }
 
 int
 zvol_init(void)
 {
 	int error;
 
 	/*
 	 * zvol_threads is the module param the user passes in.
 	 *
 	 * zvol_actual_threads is what we use internally, since the user can
-	 * pass zvol_thread = 0 to mean "use all the CPUs" (the default).
+	 * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
 	 */
 	static unsigned int zvol_actual_threads;
 
 	if (zvol_threads == 0) {
 		/*
 		 * See dde9380a1 for why 32 was chosen here.  This should
 		 * probably be refined to be some multiple of the number
 		 * of CPUs.
 		 */
 		zvol_actual_threads = MAX(num_online_cpus(), 32);
 	} else {
 		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
 	}
 
 	/*
-	 * Use atleast 32 zvol_threads but for many core system,
+	 * Use at least 32 zvol_threads, but for many-core systems,
 	 * prefer 6 threads per taskq, but no more taskqs
 	 * than threads in them on large systems.
 	 *
 	 *                 taskq   total
 	 * cpus    taskqs  threads threads
 	 * ------- ------- ------- -------
 	 * 1       1       32       32
 	 * 2       1       32       32
 	 * 4       1       32       32
 	 * 8       2       16       32
 	 * 16      3       11       33
 	 * 32      5       7        35
 	 * 64      8       8        64
 	 * 128     11      12       132
 	 * 256     16      16       256
 	 */
 	zv_taskq_t *ztqs = &zvol_taskqs;
 	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
 	if (num_tqs == 0) {
 		num_tqs = 1 + num_online_cpus() / 6;
 		while (num_tqs * num_tqs > zvol_actual_threads)
 			num_tqs--;
 	}
 	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
 	if (per_tq_thread * num_tqs < zvol_actual_threads)
 		per_tq_thread++;
 	ztqs->tqs_cnt = num_tqs;
 	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
 	error = register_blkdev(zvol_major, ZVOL_DRIVER);
 	if (error) {
 		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
 		ztqs->tqs_taskq = NULL;
 		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
 		return (error);
 	}
 
 #ifdef HAVE_BLK_MQ
 	if (zvol_blk_mq_queue_depth == 0) {
 		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
 	} else {
 		zvol_actual_blk_mq_queue_depth =
 		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
 	}
 
 	if (zvol_blk_mq_threads == 0) {
 		zvol_blk_mq_actual_threads = num_online_cpus();
 	} else {
 		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
 		    1024);
 	}
 #endif
 	for (uint_t i = 0; i < num_tqs; i++) {
 		char name[32];
 		(void) snprintf(name, sizeof (name), "%s_tq-%u",
 		    ZVOL_DRIVER, i);
 		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
 		    maxclsyspri, per_tq_thread, INT_MAX,
 		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 		if (ztqs->tqs_taskq[i] == NULL) {
 			for (int j = i - 1; j >= 0; j--)
 				taskq_destroy(ztqs->tqs_taskq[j]);
 			unregister_blkdev(zvol_major, ZVOL_DRIVER);
 			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
 			    sizeof (taskq_t *));
 			ztqs->tqs_taskq = NULL;
 			return (-ENOMEM);
 		}
 	}
 
 	zvol_init_impl();
 	ida_init(&zvol_ida);
 	return (0);
 }
 
 void
 zvol_fini(void)
 {
 	zv_taskq_t *ztqs = &zvol_taskqs;
 	zvol_fini_impl();
 	unregister_blkdev(zvol_major, ZVOL_DRIVER);
 
 	if (ztqs->tqs_taskq == NULL) {
 		ASSERT3U(ztqs->tqs_cnt, ==, 0);
 	} else {
 		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
 			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
 			taskq_destroy(ztqs->tqs_taskq[i]);
 		}
 		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
 		    sizeof (taskq_t *));
 		ztqs->tqs_taskq = NULL;
 	}
 
 	ida_destroy(&zvol_ida);
 }
 
 /* BEGIN CSTYLED */
 module_param(zvol_inhibit_dev, uint, 0644);
 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");
 
 module_param(zvol_major, uint, 0444);
 MODULE_PARM_DESC(zvol_major, "Major number for zvol device");
 
 module_param(zvol_threads, uint, 0444);
 MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
-    "to 0 to use all active CPUs");
+    " to 0 to use all active CPUs");
 
 module_param(zvol_request_sync, uint, 0644);
 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");
 
 module_param(zvol_max_discard_blocks, ulong, 0444);
 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");
 
 module_param(zvol_num_taskqs, uint, 0444);
 MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");
 
 module_param(zvol_prefetch_bytes, uint, 0644);
 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");
 
 module_param(zvol_volmode, uint, 0644);
 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
 
 #ifdef HAVE_BLK_MQ
 module_param(zvol_blk_mq_queue_depth, uint, 0644);
 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");
 
 module_param(zvol_use_blk_mq, uint, 0644);
 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");
 
 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
     "Process volblocksize blocks per thread");
 #endif
 
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
 module_param(zvol_open_timeout_ms, uint, 0644);
 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
 #endif
 
 /* END CSTYLED */
diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c
index f25afa312cb4..d7ee9d32a75b 100644
--- a/sys/contrib/openzfs/module/zfs/arc.c
+++ b/sys/contrib/openzfs/module/zfs/arc.c
@@ -1,10744 +1,10749 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2018, Joyent, Inc.
  * Copyright (c) 2011, 2020, Delphix. All rights reserved.
  * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
  * Copyright (c) 2017, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2020, George Amanakis. All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2020, The FreeBSD Foundation [1]
  *
  * [1] Portions of this software were developed by Allan Jude
  *     under sponsorship from the FreeBSD Foundation.
  */
 
 /*
  * DVA-based Adjustable Replacement Cache
  *
  * While much of the theory of operation used here is
  * based on the self-tuning, low overhead replacement cache
  * presented by Megiddo and Modha at FAST 2003, there are some
  * significant differences:
  *
  * 1. The Megiddo and Modha model assumes any page is evictable.
  * Pages in its cache cannot be "locked" into memory.  This makes
  * the eviction algorithm simple: evict the last page in the list.
- * This also make the performance characteristics easy to reason
+ * This also makes the performance characteristics easy to reason
  * about.  Our cache is not so simple.  At any given moment, some
  * subset of the blocks in the cache are un-evictable because we
  * have handed out a reference to them.  Blocks are only evictable
  * when there are no external references active.  This makes
  * eviction far more problematic:  we choose to evict the evictable
  * blocks that are the "lowest" in the list.
  *
  * There are times when it is not possible to evict the requested
  * space.  In these circumstances we are unable to adjust the cache
  * size.  To prevent the cache growing unbounded at these times we
  * implement a "cache throttle" that slows the flow of new data
  * into the cache until we can make space available.
  *
  * 2. The Megiddo and Modha model assumes a fixed cache size.
  * Pages are evicted when the cache is full and there is a cache
  * miss.  Our model has a variable sized cache.  It grows with
  * high use, but also tries to react to memory pressure from the
  * operating system: decreasing its size when system memory is
  * tight.
  *
  * 3. The Megiddo and Modha model assumes a fixed page size. All
  * elements of the cache are therefore exactly the same size.  So
- * when adjusting the cache size following a cache miss, its simply
+ * when adjusting the cache size following a cache miss, it's simply
  * a matter of choosing a single page to evict.  In our model, we
  * have variable sized cache blocks (ranging from 512 bytes to
  * 128K bytes).  We therefore choose a set of blocks to evict to make
  * space for a cache miss that approximates as closely as possible
  * the space used by the new block.
  *
  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
  * by N. Megiddo & D. Modha, FAST 2003
  */
 
 /*
  * The locking model:
  *
  * A new reference to a cache buffer can be obtained in two
  * ways: 1) via a hash table lookup using the DVA as a key,
  * or 2) via one of the ARC lists.  The arc_read() interface
  * uses method 1, while the internal ARC algorithms for
  * adjusting the cache use method 2.  We therefore provide two
  * types of locks: 1) the hash table lock array, and 2) the
  * ARC list locks.
  *
  * Buffers do not have their own mutexes, rather they rely on the
  * hash table mutexes for the bulk of their protection (i.e. most
  * fields in the arc_buf_hdr_t are protected by these mutexes).
  *
  * buf_hash_find() returns the appropriate mutex (held) when it
  * locates the requested buffer in the hash table.  It returns
  * NULL for the mutex if the buffer was not in the table.
  *
  * buf_hash_remove() expects the appropriate hash mutex to be
  * already held before it is invoked.
  *
  * Each ARC state also has a mutex which is used to protect the
  * buffer list associated with the state.  When attempting to
  * obtain a hash table lock while holding an ARC list lock you
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the active state mutex must be held before the ghost state mutex.
  *
- * It as also possible to register a callback which is run when the
+ * It is also possible to register a callback which is run when the
  * metadata limit is reached and no buffers can be safely evicted.  In
  * this case the arc user should drop a reference on some arc buffers so
  * they can be reclaimed.  For example, when using the ZPL each dentry
- * holds a references on a znode.  These dentries must be pruned before
+ * holds a reference on a znode.  These dentries must be pruned before
  * the arc buffer holding the znode can be safely evicted.
  *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  *
  * The L2ARC uses the l2ad_mtx on each vdev for the following:
  *
  *	- L2ARC buflist creation
  *	- L2ARC buflist eviction
  *	- L2ARC write completion, which walks L2ARC buflists
  *	- ARC header destruction, as it removes from L2ARC buflists
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
 /*
  * ARC operation:
  *
  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
  * This structure can point either to a block that is still in the cache or to
  * one that is only accessible in an L2 ARC device, or it can provide
  * information about a block that was recently evicted. If a block is
  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
  * information to retrieve it from the L2ARC device. This information is
  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
  * that is in this state cannot access the data directly.
  *
  * Blocks that are actively being referenced or have not been evicted
  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
  * the arc_buf_hdr_t that will point to the data block in memory. A block can
  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
  *
  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
  * ability to store the physical data (b_pabd) associated with the DVA of the
  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
  * it will match its on-disk compression characteristics. This behavior can be
  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
  * compressed ARC functionality is disabled, the b_pabd will point to an
  * uncompressed version of the on-disk data.
  *
  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
  * consumer. The ARC will provide references to this data and will keep it
  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
  * data block and will evict any arc_buf_t that is no longer referenced. The
  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
  * "overhead_size" kstat.
  *
  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
  * compressed form. The typical case is that consumers will want uncompressed
  * data, and when that happens a new data buffer is allocated where the data is
  * decompressed for them to use. Currently the only consumer who wants
  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
  * with the arc_buf_hdr_t.
  *
  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
  * first one is owned by a compressed send consumer (and therefore references
  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
  * used by any other consumer (and has its own uncompressed copy of the data
  * buffer).
  *
  *   arc_buf_hdr_t
  *   +-----------+
  *   | fields    |
  *   | common to |
  *   | L1- and   |
  *   | L2ARC     |
  *   +-----------+
  *   | l2arc_buf_hdr_t
  *   |           |
  *   +-----------+
  *   | l1arc_buf_hdr_t
  *   |           |              arc_buf_t
  *   | b_buf     +------------>+-----------+      arc_buf_t
  *   | b_pabd    +-+           |b_next     +---->+-----------+
  *   +-----------+ |           |-----------|     |b_next     +-->NULL
  *                 |           |b_comp = T |     +-----------+
  *                 |           |b_data     +-+   |b_comp = F |
  *                 |           +-----------+ |   |b_data     +-+
  *                 +->+------+               |   +-----------+ |
  *        compressed  |      |               |                 |
  *           data     |      |<--------------+                 | uncompressed
  *                    +------+          compressed,            |     data
  *                                        shared               +-->+------+
  *                                         data                    |      |
  *                                                                 |      |
  *                                                                 +------+
  *
  * When a consumer reads a block, the ARC must first look to see if the
  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
  * arc_buf_t and either copies uncompressed data into a new data buffer from an
  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
  * hdr is compressed and the desired compression characteristics of the
  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
  * be anywhere in the hdr's list.
  *
  * The diagram below shows an example of an uncompressed ARC hdr that is
  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
  * the last element in the buf list):
  *
  *                arc_buf_hdr_t
  *                +-----------+
  *                |           |
  *                |           |
  *                |           |
  *                +-----------+
  * l2arc_buf_hdr_t|           |
  *                |           |
  *                +-----------+
  * l1arc_buf_hdr_t|           |
  *                |           |                 arc_buf_t    (shared)
  *                |    b_buf  +------------>+---------+      arc_buf_t
  *                |           |             |b_next   +---->+---------+
  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
  *                +-----------+ |           |         |     +---------+
  *                              |           |b_data   +-+   |         |
  *                              |           +---------+ |   |b_data   +-+
  *                              +->+------+             |   +---------+ |
  *                                 |      |             |               |
  *                   uncompressed  |      |             |               |
  *                        data     +------+             |               |
  *                                    ^                 +->+------+     |
  *                                    |       uncompressed |      |     |
  *                                    |           data     |      |     |
  *                                    |                    +------+     |
  *                                    +---------------------------------+
  *
  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
  * since the physical block is about to be rewritten. The new data contents
  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
  * it may compress the data before writing it to disk. The ARC will be called
  * with the transformed data and will memcpy the transformed on-disk block into
  * a newly allocated b_pabd. Writes are always done into buffers which have
  * either been loaned (and hence are new and don't have other readers) or
  * buffers which have been released (and hence have their own hdr, if there
  * were originally other readers of the buf's original hdr). This ensures that
  * the ARC only needs to update a single buf and its hdr after a write occurs.
  *
  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
  * that when compressed ARC is enabled that the L2ARC blocks are identical
  * to the on-disk block in the main data pool. This provides a significant
  * advantage since the ARC can leverage the bp's checksum when reading from the
  * L2ARC to determine if the contents are valid. However, if the compressed
  * ARC is disabled, then the L2ARC's block must be transformed to look
  * like the physical block in the main data pool before comparing the
  * checksum and determining its validity.
  *
  * The L1ARC has a slightly different system for storing encrypted data.
  * Raw (encrypted + possibly compressed) data has a few subtle differences from
  * data that is just compressed. The biggest difference is that it is not
  * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded.
  * The other difference is that encryption cannot be treated as a suggestion.
  * If a caller would prefer compressed data, but they actually wind up with
  * uncompressed data the worst thing that could happen is there might be a
  * performance hit. If the caller requests encrypted data, however, we must be
  * sure they actually get it or else secret information could be leaked. Raw
  * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
  * may have both an encrypted version and a decrypted version of its data at
  * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
  * copied out of this header. To avoid complications with b_pabd, raw buffers
  * cannot be shared.
  */
 
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/spa_impl.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/zfs_refcount.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/dsl_pool.h>
 #include <sys/multilist.h>
 #include <sys/abd.h>
 #include <sys/zil.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/callb.h>
 #include <sys/kstat.h>
 #include <sys/zthr.h>
 #include <zfs_fletcher.h>
 #include <sys/arc_impl.h>
 #include <sys/trace_zfs.h>
 #include <sys/aggsum.h>
 #include <sys/wmsum.h>
 #include <cityhash.h>
 #include <sys/vdev_trim.h>
 #include <sys/zfs_racct.h>
 #include <sys/zstd/zstd.h>
 
 #ifndef _KERNEL
 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
 boolean_t arc_watch = B_FALSE;
 #endif
 
 /*
  * This thread's job is to keep enough free memory in the system, by
  * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves
  * arc_available_memory().
  */
 static zthr_t *arc_reap_zthr;
 
 /*
  * This thread's job is to keep arc_size under arc_c, by calling
  * arc_evict(), which improves arc_is_overflowing().
  */
 static zthr_t *arc_evict_zthr;
 static arc_buf_hdr_t **arc_state_evict_markers;
 static int arc_state_evict_marker_count;
 
 static kmutex_t arc_evict_lock;
 static boolean_t arc_evict_needed = B_FALSE;
 static clock_t arc_last_uncached_flush;
 
 /*
  * Count of bytes evicted since boot.
  */
 static uint64_t arc_evict_count;
 
 /*
  * List of arc_evict_waiter_t's, representing threads waiting for the
  * arc_evict_count to reach specific values.
  */
 static list_t arc_evict_waiters;
 
 /*
  * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of
  * the requested amount of data to be evicted.  For example, by default for
  * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation.
  * Since this is above 100%, it ensures that progress is made towards getting
  * arc_size under arc_c.  Since this is finite, it ensures that allocations
  * can still happen, even during the potentially long time that arc_size is
  * more than arc_c.
  */
 static uint_t zfs_arc_eviction_pct = 200;
 
 /*
  * The number of headers to evict in arc_evict_state_impl() before
  * dropping the sublist lock and evicting from another sublist. A lower
  * value means we're more likely to evict the "correct" header (i.e. the
  * oldest header in the arc state), but comes with higher overhead
  * (i.e. more invocations of arc_evict_state_impl()).
  */
 static uint_t zfs_arc_evict_batch_limit = 10;
 
 /* number of seconds before growing cache again */
 uint_t arc_grow_retry = 5;
 
 /*
  * Minimum time between calls to arc_kmem_reap_soon().
  */
 static const int arc_kmem_cache_reap_retry_ms = 1000;
 
 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
 static int zfs_arc_overflow_shift = 8;
 
 /* log2(fraction of arc to reclaim) */
 uint_t arc_shrink_shift = 7;
 
 /* percent of pagecache to reclaim arc to */
 #ifdef _KERNEL
 uint_t zfs_arc_pc_percent = 0;
 #endif
 
 /*
  * log2(fraction of ARC which must be free to allow growing).
  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
  * when reading a new block into the ARC, we will evict an equal-sized block
  * from the ARC.
  *
  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
  * we will still not allow it to grow.
  */
 uint_t		arc_no_grow_shift = 5;
 
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
 static uint_t		arc_min_prefetch_ms;
 static uint_t		arc_min_prescient_prefetch_ms;
 
 /*
  * If this percent of memory is free, don't throttle.
  */
 uint_t arc_lotsfree_percent = 10;
 
 /*
  * The arc has filled available memory and has now warmed up.
  */
 boolean_t arc_warm;
 
 /*
  * These tunables are for performance analysis.
  */
 uint64_t zfs_arc_max = 0;
 uint64_t zfs_arc_min = 0;
 static uint64_t zfs_arc_dnode_limit = 0;
 static uint_t zfs_arc_dnode_reduce_percent = 10;
 static uint_t zfs_arc_grow_retry = 0;
 static uint_t zfs_arc_shrink_shift = 0;
 uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 
 /*
  * ARC dirty data constraints for arc_tempreserve_space() throttle:
  * * total dirty data limit
  * * anon block dirty limit
  * * each pool's anon allowance
  */
 static const unsigned long zfs_arc_dirty_limit_percent = 50;
 static const unsigned long zfs_arc_anon_limit_percent = 25;
 static const unsigned long zfs_arc_pool_dirty_percent = 20;
 
 /*
  * Enable or disable compressed arc buffers.
  */
 int zfs_compressed_arc_enabled = B_TRUE;
 
 /*
  * Balance between metadata and data on ghost hits.  Values above 100
  * increase metadata caching by proportionally reducing effect of ghost
  * data hits on target data/metadata rate.
  */
 static uint_t zfs_arc_meta_balance = 500;
 
 /*
  * Percentage that can be consumed by dnodes of ARC meta buffers.
  */
 static uint_t zfs_arc_dnode_limit_percent = 10;
 
 /*
  * These tunables are Linux-specific
  */
 static uint64_t zfs_arc_sys_free = 0;
 static uint_t zfs_arc_min_prefetch_ms = 0;
 static uint_t zfs_arc_min_prescient_prefetch_ms = 0;
 static uint_t zfs_arc_lotsfree_percent = 10;
 
 /*
  * Number of arc_prune threads
  */
 static int zfs_arc_prune_task_threads = 1;
 
 /* The 7 states: */
 arc_state_t ARC_anon;
 arc_state_t ARC_mru;
 arc_state_t ARC_mru_ghost;
 arc_state_t ARC_mfu;
 arc_state_t ARC_mfu_ghost;
 arc_state_t ARC_l2c_only;
 arc_state_t ARC_uncached;
 
 arc_stats_t arc_stats = {
 	{ "hits",			KSTAT_DATA_UINT64 },
 	{ "iohits",			KSTAT_DATA_UINT64 },
 	{ "misses",			KSTAT_DATA_UINT64 },
 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_iohits",		KSTAT_DATA_UINT64 },
 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_iohits",	KSTAT_DATA_UINT64 },
 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
 	{ "prefetch_data_iohits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_iohits",	KSTAT_DATA_UINT64 },
 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
 	{ "mru_hits",			KSTAT_DATA_UINT64 },
 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
 	{ "uncached_hits",		KSTAT_DATA_UINT64 },
 	{ "deleted",			KSTAT_DATA_UINT64 },
 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
 	{ "access_skip",		KSTAT_DATA_UINT64 },
 	{ "evict_skip",			KSTAT_DATA_UINT64 },
 	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible_mfu",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_eligible_mru",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
 	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
 	{ "hash_elements",		KSTAT_DATA_UINT64 },
 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
 	{ "hash_chains",		KSTAT_DATA_UINT64 },
 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
 	{ "meta",			KSTAT_DATA_UINT64 },
 	{ "pd",				KSTAT_DATA_UINT64 },
 	{ "pm",				KSTAT_DATA_UINT64 },
 	{ "c",				KSTAT_DATA_UINT64 },
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
 	{ "compressed_size",		KSTAT_DATA_UINT64 },
 	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
 	{ "overhead_size",		KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
 	{ "metadata_size",		KSTAT_DATA_UINT64 },
 	{ "dbuf_size",			KSTAT_DATA_UINT64 },
 	{ "dnode_size",			KSTAT_DATA_UINT64 },
 	{ "bonus_size",			KSTAT_DATA_UINT64 },
 #if defined(COMPAT_FREEBSD11)
 	{ "other_size",			KSTAT_DATA_UINT64 },
 #endif
 	{ "anon_size",			KSTAT_DATA_UINT64 },
 	{ "anon_data",			KSTAT_DATA_UINT64 },
 	{ "anon_metadata",		KSTAT_DATA_UINT64 },
 	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_size",			KSTAT_DATA_UINT64 },
 	{ "mru_data",			KSTAT_DATA_UINT64 },
 	{ "mru_metadata",		KSTAT_DATA_UINT64 },
 	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
 	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_data",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_metadata",		KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "mfu_size",			KSTAT_DATA_UINT64 },
 	{ "mfu_data",			KSTAT_DATA_UINT64 },
 	{ "mfu_metadata",		KSTAT_DATA_UINT64 },
 	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
 	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_data",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_metadata",		KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "uncached_size",		KSTAT_DATA_UINT64 },
 	{ "uncached_data",		KSTAT_DATA_UINT64 },
 	{ "uncached_metadata",		KSTAT_DATA_UINT64 },
 	{ "uncached_evictable_data",	KSTAT_DATA_UINT64 },
 	{ "uncached_evictable_metadata", KSTAT_DATA_UINT64 },
 	{ "l2_hits",			KSTAT_DATA_UINT64 },
 	{ "l2_misses",			KSTAT_DATA_UINT64 },
 	{ "l2_prefetch_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_mru_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_mfu_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_bufc_data_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_bufc_metadata_asize",	KSTAT_DATA_UINT64 },
 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
 	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_writes",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_avg_asize",	KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_log_blk_count",		KSTAT_DATA_UINT64 },
 	{ "l2_data_to_meta_ratio",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_success",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_unsupported",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_io_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_dh_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_cksum_lb_errors",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_size",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_asize",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_bufs",		KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_bufs_precached",	KSTAT_DATA_UINT64 },
 	{ "l2_rebuild_log_blks",	KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
 	{ "memory_direct_count",	KSTAT_DATA_UINT64 },
 	{ "memory_indirect_count",	KSTAT_DATA_UINT64 },
 	{ "memory_all_bytes",		KSTAT_DATA_UINT64 },
 	{ "memory_free_bytes",		KSTAT_DATA_UINT64 },
 	{ "memory_available_bytes",	KSTAT_DATA_INT64 },
 	{ "arc_no_grow",		KSTAT_DATA_UINT64 },
 	{ "arc_tempreserve",		KSTAT_DATA_UINT64 },
 	{ "arc_loaned_bytes",		KSTAT_DATA_UINT64 },
 	{ "arc_prune",			KSTAT_DATA_UINT64 },
 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
 	{ "arc_dnode_limit",		KSTAT_DATA_UINT64 },
 	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
 	{ "predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 },
 	{ "prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 },
 	{ "arc_need_free",		KSTAT_DATA_UINT64 },
 	{ "arc_sys_free",		KSTAT_DATA_UINT64 },
 	{ "arc_raw_size",		KSTAT_DATA_UINT64 },
 	{ "cached_only_in_progress",	KSTAT_DATA_UINT64 },
 	{ "abd_chunk_waste_size",	KSTAT_DATA_UINT64 },
 };
 
 arc_sums_t arc_sums;
 
 /*
  * Raise the named kstat to val if val is larger than the value currently
  * stored.  The loop keeps going while val is still greater than the value
  * just read and atomic_cas_64() returned something other than that value.
  * Note that val is evaluated more than once, so it must be free of side
  * effects.
  */
 #define	ARCSTAT_MAX(stat, val) {					\
 	uint64_t m;							\
 	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
 	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
 		continue;						\
 }
 
 /*
  * We define a macro to allow ARC hits/misses to be easily broken down by
  * two separate conditions, giving a total of four different subtypes for
  * each of hits and misses (so eight statistics total).
  */
 #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
 	if (cond1) {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
 		}							\
 	} else {							\
 		if (cond2) {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
 		} else {						\
 			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
 		}							\
 	}
 
 /*
  * This macro allows us to use kstats as floating averages. Each time we
  * update this kstat, we first factor it and the update value by
  * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
  * average. This macro assumes that integer loads and stores are atomic, but
  * is not safe for multiple writers updating the kstat in parallel (only the
  * last writer's update will remain).
  */
 #define	ARCSTAT_F_AVG_FACTOR	3
 #define	ARCSTAT_F_AVG(stat, value) \
 	do { \
 		uint64_t x = ARCSTAT(stat); \
 		x = x - x / ARCSTAT_F_AVG_FACTOR + \
 		    (value) / ARCSTAT_F_AVG_FACTOR; \
 		ARCSTAT(stat) = x; \
 	} while (0)
 
 static kstat_t			*arc_ksp;
 
 /*
  * There are several ARC variables that are critical to export as kstats --
  * but we don't want to have to grovel around in the kstat whenever we wish to
  * manipulate them.  For these variables, we therefore define them to be in
  * terms of the statistic variable.  This assures that we are not introducing
  * the possibility of inconsistency by having shadow copies of the variables,
  * while still allowing the code to be readable.
  */
 #define	arc_tempreserve	ARCSTAT(arcstat_tempreserve)
 #define	arc_loaned_bytes	ARCSTAT(arcstat_loaned_bytes)
 #define	arc_dnode_limit	ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
 #define	arc_need_free	ARCSTAT(arcstat_need_free) /* waiting to be evicted */
 
 hrtime_t arc_growtime;
 list_t arc_prune_list;
 kmutex_t arc_prune_mtx;
 taskq_t *arc_prune_taskq;
 
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
 
 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
 #define	HDR_PRESCIENT_PREFETCH(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define	HDR_COMPRESSION_ENABLED(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
 #define	HDR_UNCACHED(hdr)	((hdr)->b_flags & ARC_FLAG_UNCACHED)
 #define	HDR_L2_READING(hdr)	\
 	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
 	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
 #define	HDR_PROTECTED(hdr)	((hdr)->b_flags & ARC_FLAG_PROTECTED)
 #define	HDR_NOAUTH(hdr)		((hdr)->b_flags & ARC_FLAG_NOAUTH)
 #define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 
 #define	HDR_ISTYPE_METADATA(hdr)	\
 	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
 
 #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 #define	HDR_HAS_RABD(hdr)	\
 	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&	\
 	(hdr)->b_crypt_hdr.b_rabd != NULL)
 #define	HDR_ENCRYPTED(hdr)	\
 	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 #define	HDR_AUTHENTICATED(hdr)	\
 	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
 
 /* For storing compression mode in b_flags */
 #define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)
 
 #define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
 #define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
 	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
 
 #define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
 #define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
 #define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
 #define	ARC_BUF_ENCRYPTED(buf)	((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
 
 /*
  * Other sizes
  */
 
 #define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
 #define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
 
 /*
  * Hash table routines
  */
 
 #define	BUF_LOCKS 2048
 typedef struct buf_hash_table {
 	uint64_t ht_mask;
 	arc_buf_hdr_t **ht_table;
 	kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned;
 } buf_hash_table_t;
 
 static buf_hash_table_t buf_hash_table;
 
 /*
  * BUF_HASH_INDEX maps a buffer identity to its hash table bucket.
  * BUF_HASH_LOCK maps a bucket index to one of the BUF_LOCKS hash locks.
  * HDR_LOCK yields the hash lock covering a given header.
  *
  * Macro arguments are parenthesized so that compound expressions (for
  * example "a | b") are masked or dereferenced as a whole.
  */
 #define	BUF_HASH_INDEX(spa, dva, birth) \
 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
 #define	BUF_HASH_LOCK(idx)	\
 	(&buf_hash_table.ht_locks[(idx) & (BUF_LOCKS-1)])
 #define	HDR_LOCK(hdr) \
 	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
 	(hdr)->b_birth)))
 
 uint64_t zfs_crc64_table[256];
 
 /*
  * Level 2 ARC
  */
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
 #define	L2ARC_HEADROOM		2			/* num of writes */
 
 /*
  * If we discover during ARC scan any buffers to be compressed, we boost
  * our headroom for the next scanning cycle by this percentage multiple.
  */
 #define	L2ARC_HEADROOM_BOOST	200
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
 /*
  * We can feed L2ARC from two states of ARC buffers, mru and mfu,
  * and each of the state has two types: data and metadata.
  */
 #define	L2ARC_FEED_TYPES	4
 
 /* L2ARC Performance Tunables */
 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* def max write size */
 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra warmup write */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* # of dev writes */
 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval msecs */
 int l2arc_noprefetch = B_TRUE;			/* don't cache prefetch bufs */
 int l2arc_feed_again = B_TRUE;			/* turbo warmup */
 int l2arc_norw = B_FALSE;			/* no reads during writes */
 static uint_t l2arc_meta_percent = 33;	/* limit on headers size */
 
 /*
  * L2ARC Internals
  */
 static list_t L2ARC_dev_list;			/* device list */
 static list_t *l2arc_dev_list;			/* device list pointer */
 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
 static list_t L2ARC_free_on_write;		/* free after write buf list */
 static list_t *l2arc_free_on_write;		/* free after write list ptr */
 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
 static uint64_t l2arc_ndev;			/* number of devices */
 
 typedef struct l2arc_read_callback {
 	arc_buf_hdr_t		*l2rcb_hdr;		/* read header */
 	blkptr_t		l2rcb_bp;		/* original blkptr */
 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
 	int			l2rcb_flags;		/* original flags */
 	abd_t			*l2rcb_abd;		/* temporary buffer */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	abd_t		*l2df_abd;
 	size_t		l2df_size;
 	arc_buf_contents_t l2df_type;
 	list_node_t	l2df_list_node;
 } l2arc_data_free_t;
 
 typedef enum arc_fill_flags {
 	ARC_FILL_LOCKED		= 1 << 0, /* hdr lock is held */
 	ARC_FILL_COMPRESSED	= 1 << 1, /* fill with compressed data */
 	ARC_FILL_ENCRYPTED	= 1 << 2, /* fill with encrypted data */
 	ARC_FILL_NOAUTH		= 1 << 3, /* don't attempt to authenticate */
 	ARC_FILL_IN_PLACE	= 1 << 4  /* fill in place (special case) */
 } arc_fill_flags_t;
 
 typedef enum arc_ovf_level {
 	ARC_OVF_NONE,			/* ARC within target size. */
 	ARC_OVF_SOME,			/* ARC is slightly overflowed. */
 	ARC_OVF_SEVERE			/* ARC is severely overflowed. */
 } arc_ovf_level_t;
 
 static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
 static kmutex_t l2arc_rebuild_thr_lock;
 static kcondvar_t l2arc_rebuild_thr_cv;
 
 enum arc_hdr_alloc_flags {
 	ARC_HDR_ALLOC_RDATA = 0x1,
 	ARC_HDR_USE_RESERVE = 0x4,
 	ARC_HDR_ALLOC_LINEAR = 0x8,
 };
 
 
 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int);
 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *);
 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int);
 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *);
 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *);
 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size,
     const void *tag);
 static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t);
 static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int);
 static void arc_hdr_destroy(arc_buf_hdr_t *);
 static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t);
 static void arc_buf_watch(arc_buf_t *);
 static void arc_change_state(arc_state_t *, arc_buf_hdr_t *);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 static void l2arc_do_free_on_write(void);
 static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only);
 
 static void arc_prune_async(uint64_t adjust);
 
 #define	l2arc_hdr_arcstats_increment(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE)
 #define	l2arc_hdr_arcstats_decrement(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE)
 #define	l2arc_hdr_arcstats_increment_state(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE)
 #define	l2arc_hdr_arcstats_decrement_state(hdr) \
 	l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE)
 
 /*
  * l2arc_exclude_special : A zfs module parameter that controls whether buffers
  * 		present on special vdevs are eligible for caching in L2ARC. If
  * 		set to 1, exclude dbufs on special vdevs from being cached to
  * 		L2ARC.
  */
 int l2arc_exclude_special = 0;
 
 /*
  * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU
  * 		metadata and data are cached from ARC into L2ARC.
  */
 static int l2arc_mfuonly = 0;
 
 /*
  * L2ARC TRIM
  * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of
  * 		the current write size (l2arc_write_max) we should TRIM if we
  * 		have filled the device. It is defined as a percentage of the
  * 		write size. If set to 100 we trim twice the space required to
  * 		accommodate upcoming writes. A minimum of 64MB will be trimmed.
  * 		It also enables TRIM of the whole L2ARC device upon creation or
  * 		addition to an existing pool or if the header of the device is
  * 		invalid upon importing a pool or onlining a cache device. The
  * 		default is 0, which disables TRIM on L2ARC altogether as it can
  * 		put significant stress on the underlying storage devices. This
  * 		will vary depending on how well the specific device handles
  * 		these commands.
  */
 static uint64_t l2arc_trim_ahead = 0;
 
 /*
  * Performance tuning of L2ARC persistence:
  *
  * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding
  * 		an L2ARC device (either at pool import or later) will attempt
  * 		to rebuild L2ARC buffer contents.
  * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls
  * 		whether log blocks are written to the L2ARC device. If the L2ARC
  * 		device is less than 1GB, the amount of data l2arc_evict()
  * 		evicts is significant compared to the amount of restored L2ARC
  * 		data. In this case do not write log blocks in L2ARC in order
  * 		not to waste space.
  */
 static int l2arc_rebuild_enabled = B_TRUE;
 static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024;
 
 /* L2ARC persistence rebuild control routines. */
 void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen);
 static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg);
 static int l2arc_rebuild(l2arc_dev_t *dev);
 
 /* L2ARC persistence read I/O routines. */
 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
 static int l2arc_log_blk_read(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     zio_t *this_io, zio_t **next_io);
 static zio_t *l2arc_log_blk_fetch(vdev_t *vd,
     const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb);
 static void l2arc_log_blk_fetch_abort(zio_t *zio);
 
 /* L2ARC persistence block restoration routines. */
 static void l2arc_log_blk_restore(l2arc_dev_t *dev,
     const l2arc_log_blk_phys_t *lb, uint64_t lb_asize);
 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
     l2arc_dev_t *dev);
 
 /* L2ARC persistence write I/O routines. */
 static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
     l2arc_write_callback_t *cb);
 
 /* L2ARC persistence auxiliary routines. */
 boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *lbp);
 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
     const arc_buf_hdr_t *ab);
 boolean_t l2arc_range_check_overlap(uint64_t bottom,
     uint64_t top, uint64_t check);
 static void l2arc_blk_fetch_done(zio_t *zio);
 static inline uint64_t
     l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev);
 
 /*
  * Hash a buffer identity (spa, the two DVA words and birth) with
  * cityhash4().  Cityhash is fast, and has good hash properties without
  * requiring any large static buffers.
  */
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
 	const uint64_t word0 = dva->dva_word[0];
 	const uint64_t word1 = dva->dva_word[1];
 
 	return (cityhash4(spa, word0, word1, birth));
 }
 
 #define	HDR_EMPTY(hdr)						\
 	((hdr)->b_dva.dva_word[0] == 0 &&			\
 	(hdr)->b_dva.dva_word[1] == 0)
 
 #define	HDR_EMPTY_OR_LOCKED(hdr)				\
 	(HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr)))
 
 /*
  * True when hdr carries the identity (spa, dva, birth).  The whole
  * expansion and every argument are parenthesized so the macro can be
  * negated or combined with other operators safely.
  */
 #define	HDR_EQUAL(spa, dva, birth, hdr)				\
 	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
 	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
 	((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
 
 /*
  * Clear a header's identity by zeroing both DVA words and the birth value.
  */
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
 {
 	dva_t *dva = &hdr->b_dva;
 
 	dva->dva_word[0] = 0;
 	dva->dva_word[1] = 0;
 	hdr->b_birth = 0;
 }
 
 /*
  * Look up the header identified by (spa, bp) in the hash table.
  *
  * On a hit the header is returned with its hash lock held and *lockp set to
  * that lock.  On a miss the lock is dropped, *lockp is set to NULL and NULL
  * is returned.
  */
 static arc_buf_hdr_t *
 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
 {
 	const dva_t *dva = BP_IDENTITY(bp);
 	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
 	uint64_t bucket = BUF_HASH_INDEX(spa, dva, birth);
 	kmutex_t *bucket_lock = BUF_HASH_LOCK(bucket);
 	arc_buf_hdr_t *cur;
 
 	mutex_enter(bucket_lock);
 
 	/* Walk the chain until a matching header or the end is reached. */
 	cur = buf_hash_table.ht_table[bucket];
 	while (cur != NULL && !(HDR_EQUAL(spa, dva, birth, cur)))
 		cur = cur->b_hash_next;
 
 	if (cur == NULL) {
 		mutex_exit(bucket_lock);
 		*lockp = NULL;
 		return (NULL);
 	}
 
 	*lockp = bucket_lock;
 	return (cur);
 }
 
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
  * If lockp == NULL, the caller is assumed to already hold the hash lock.
  *
  * When lockp != NULL the hash lock is taken here and stored in *lockp; this
  * function does not release it on either return path.
  */
 static arc_buf_hdr_t *
 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
 {
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
 	arc_buf_hdr_t *fhdr;
 	uint32_t i;
 
 	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
 	ASSERT(hdr->b_birth != 0);
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (lockp != NULL) {
 		*lockp = hash_lock;
 		mutex_enter(hash_lock);
 	} else {
 		ASSERT(MUTEX_HELD(hash_lock));
 	}
 
 	/*
 	 * Scan the bucket for an existing header with the same identity.
 	 * If none is found, i ends up as the number of headers already
 	 * in this bucket.
 	 */
 	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
 	    fhdr = fhdr->b_hash_next, i++) {
 		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
 			return (fhdr);
 	}
 
 	/* No duplicate: link the new header at the head of the bucket. */
 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
 	buf_hash_table.ht_table[idx] = hdr;
 	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	if (i > 0) {
 		ARCSTAT_BUMP(arcstat_hash_collisions);
 		/* The bucket just went from one header to two. */
 		if (i == 1)
 			ARCSTAT_BUMP(arcstat_hash_chains);
 
 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
 	}
 	uint64_t he = atomic_inc_64_nv(
 	    &arc_stats.arcstat_hash_elements.value.ui64);
 	ARCSTAT_MAX(arcstat_hash_elements_max, he);
 
 	return (NULL);
 }
 
 /*
  * Unlink hdr from its hash bucket and update the hash table statistics.
  * The hash lock for hdr's bucket must be held and hdr must currently be
  * flagged as being in the hash table.
  */
 static void
 buf_hash_remove(arc_buf_hdr_t *hdr)
 {
 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
 	arc_buf_hdr_t **linkp, *cur, *head;
 
 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
 	ASSERT(HDR_IN_HASH_TABLE(hdr));
 
 	/* Find the link that points at hdr. */
 	for (linkp = &buf_hash_table.ht_table[idx]; (cur = *linkp) != hdr;
 	    linkp = &cur->b_hash_next) {
 		ASSERT3P(cur, !=, NULL);
 	}
 	*linkp = hdr->b_hash_next;
 	hdr->b_hash_next = NULL;
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	atomic_dec_64(&arc_stats.arcstat_hash_elements.value.ui64);
 
 	/* Exactly one header left in the bucket: drop the chain count. */
 	head = buf_hash_table.ht_table[idx];
 	if (head != NULL && head->b_hash_next == NULL)
 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
 }
 
 /*
  * Global data structures and functions for the buf kmem cache.
  */
 
 static kmem_cache_t *hdr_full_cache;
 static kmem_cache_t *hdr_l2only_cache;
 static kmem_cache_t *buf_cache;
 
 /*
  * Tear down what buf_init() set up: free the hash table, destroy the hash
  * locks and destroy the three kmem caches.
  */
 static void
 buf_fini(void)
 {
 	uint64_t ht_bytes = (buf_hash_table.ht_mask + 1) * sizeof (void *);
 
 #if defined(_KERNEL)
 	/*
 	 * Large allocations which do not require contiguous pages
 	 * should be using vmem_free() in the linux kernel.
 	 */
 	vmem_free(buf_hash_table.ht_table, ht_bytes);
 #else
 	kmem_free(buf_hash_table.ht_table, ht_bytes);
 #endif
 
 	for (int n = 0; n < BUF_LOCKS; n++)
 		mutex_destroy(BUF_HASH_LOCK(n));
 
 	kmem_cache_destroy(hdr_full_cache);
 	kmem_cache_destroy(hdr_l2only_cache);
 	kmem_cache_destroy(buf_cache);
 }
 
 /*
  * Constructor callback - called when the cache is empty
  * and a new buf is requested.
  *
  * Zeroes a full header, initializes its refcount, list links and (under
  * ZFS_DEBUG) freeze lock, then charges HDR_FULL_SIZE to ARC_SPACE_HDRS.
  * Always returns 0.
  */
 static int
 hdr_full_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	(void) unused;
 	(void) kmflag;
 
 	memset(hdr, 0, HDR_FULL_SIZE);
 	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
 #ifdef ZFS_DEBUG
 	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
 #endif
 	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
 	list_link_init(&hdr->b_l2hdr.b_l2node);
 
 	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 	return (0);
 }
 
 /*
  * Constructor callback for L2-only headers: zeroes HDR_L2ONLY_SIZE bytes and
  * charges that size to ARC_SPACE_L2HDRS.  Always returns 0.
  */
 static int
 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	(void) unused;
 	(void) kmflag;
 
 	memset(hdr, 0, HDR_L2ONLY_SIZE);
 	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 	return (0);
 }
 
 /*
  * Constructor callback for arc_buf_t objects: zeroes the buf and charges
  * its size to ARC_SPACE_HDRS.  Always returns 0.
  */
 static int
 buf_cons(void *vbuf, void *unused, int kmflag)
 {
 	arc_buf_t *buf = vbuf;
 
 	(void) unused;
 	(void) kmflag;
 
 	memset(buf, 0, sizeof (*buf));
 	arc_space_consume(sizeof (*buf), ARC_SPACE_HDRS);
 	return (0);
 }
 
 /*
  * Destructor callback - called when a cached buf is
  * no longer required.
  *
  * Releases what hdr_full_cons() set up on a full header and returns
  * HDR_FULL_SIZE to ARC_SPACE_HDRS.  The header must have no identity.
  */
 static void
 hdr_full_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	(void) unused;
 
 	ASSERT(HDR_EMPTY(hdr));
 	zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
 #ifdef ZFS_DEBUG
 	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
 }
 
 /*
  * Destructor callback for L2-only headers: returns HDR_L2ONLY_SIZE to
  * ARC_SPACE_L2HDRS.  The header must have no identity.
  */
 static void
 hdr_l2only_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
 	(void) unused;
 
 	ASSERT(HDR_EMPTY(hdr));
 	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
 /*
  * Destructor callback for arc_buf_t objects: returns the buf's size to
  * ARC_SPACE_HDRS.  Neither argument is used.
  */
 static void
 buf_dest(void *vbuf, void *unused)
 {
 	(void) vbuf;
 	(void) unused;
 
 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
 }
 
 /*
  * Set up the buffer hash table: size and allocate the table, create the
  * full-header, L2-only-header and buf kmem caches, fill zfs_crc64_table
  * and initialize the BUF_LOCKS hash locks.
  */
 static void
 buf_init(void)
 {
 	uint64_t *ct = NULL;
 	uint64_t hsize = 1ULL << 12;
 	int i, j;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
 	 * By default, the table will take up
 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
 	 */
 	while (hsize * zfs_arc_average_blocksize < arc_all_memory())
 		hsize <<= 1;
 retry:
 	/* hsize is a power of two, so hsize - 1 works as an index mask. */
 	buf_hash_table.ht_mask = hsize - 1;
 #if defined(_KERNEL)
 	/*
 	 * Large allocations which do not require contiguous pages
 	 * should be using vmem_alloc() in the linux kernel
 	 * (vmem_zalloc() here, so the table starts out zeroed).
 	 */
 	buf_hash_table.ht_table =
 	    vmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
 #else
 	buf_hash_table.ht_table =
 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
 #endif
 	/* Allocation failed: halve the table size and try again. */
 	if (buf_hash_table.ht_table == NULL) {
 		ASSERT(hsize > (1ULL << 8));
 		hsize >>= 1;
 		goto retry;
 	}
 
 	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
 	    0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0);
 	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
 	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL,
 	    NULL, NULL, 0);
 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
 
 	/*
 	 * Fill the 256-entry CRC-64 table: each entry starts as its own
 	 * index and is shifted right eight times, XORing in ZFS_CRC64_POLY
 	 * whenever the bit shifted out was set.
 	 */
 	for (i = 0; i < 256; i++)
 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
 
 	for (i = 0; i < BUF_LOCKS; i++)
 		mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL);
 }
 
 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
 /*
  * This is the size that the buf occupies in memory. If the buf is compressed,
  * it will correspond to the compressed size. You should use this method of
  * getting the buf size unless you explicitly need the logical size.
  */
 uint64_t
 arc_buf_size(arc_buf_t *buf)
 {
 	if (ARC_BUF_COMPRESSED(buf))
 		return (HDR_GET_PSIZE(buf->b_hdr));
 
 	return (HDR_GET_LSIZE(buf->b_hdr));
 }
 
 /*
  * Return the logical size recorded in the buf's header.
  */
 uint64_t
 arc_buf_lsize(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (HDR_GET_LSIZE(hdr));
 }
 
 /*
  * This function will return B_TRUE if the buffer is encrypted in memory.
  * This buffer can be decrypted by calling arc_untransform().
  */
 boolean_t
 arc_is_encrypted(arc_buf_t *buf)
 {
 	if (ARC_BUF_ENCRYPTED(buf))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Returns B_TRUE if the buffer represents data that has not had its MAC
  * verified yet.
  */
 boolean_t
 arc_is_unauthenticated(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	if (HDR_NOAUTH(hdr))
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Copy the salt, IV and MAC stored in the buf's header into the caller's
  * buffers and report the byte order through *byteorder.  The header must
  * be flagged as protected.
  */
 void
 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
     uint8_t *iv, uint8_t *mac)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_PROTECTED(hdr));
 
 	memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
 	memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
 	memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
 
 	/* Host byte order unless a byteswap function is recorded. */
 	if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS)
 		*byteorder = ZFS_HOST_BYTEORDER;
 	else
 		*byteorder = !ZFS_HOST_BYTEORDER;
 }
 
 /*
  * Indicates how this buffer is compressed in memory. If it is not compressed
  * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
  * arc_untransform() as long as it is also unencrypted.
  */
 enum zio_compress
 arc_get_compression(arc_buf_t *buf)
 {
 	if (!ARC_BUF_COMPRESSED(buf))
 		return (ZIO_COMPRESS_OFF);
 
 	return (HDR_GET_COMPRESS(buf->b_hdr));
 }
 
 /*
  * Return the compression algorithm used to store this data in the ARC. If ARC
  * compression is enabled or this is an encrypted block, this will be the same
  * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
  */
 static inline enum zio_compress
 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
 {
 	if (!HDR_COMPRESSION_ENABLED(hdr))
 		return (ZIO_COMPRESS_OFF);
 
 	return (HDR_GET_COMPRESS(hdr));
 }
 
 /*
  * Return the compression level recorded in the buf's header.
  */
 uint8_t
 arc_get_complevel(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (hdr->b_complevel);
 }
 
 /*
  * Return B_TRUE when the buf's data pointer is the very buffer backing the
  * header's linear b_pabd, i.e. the buf shares its data with the header.
  * The result is cross-checked against the header and buf flags.
  */
 static inline boolean_t
 arc_buf_is_shared(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t shared;
 
 	shared = (buf->b_data != NULL &&
 	    hdr->b_l1hdr.b_pabd != NULL &&
 	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
 	    buf->b_data == abd_to_buf(hdr->b_l1hdr.b_pabd));
 
 	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
 	EQUIV(shared, ARC_BUF_SHARED(buf));
 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
 
 	/*
 	 * It would be nice to assert arc_can_share() too, but the "hdr isn't
 	 * already being shared" requirement prevents us from doing that.
 	 */
 
 	return (shared);
 }
 
 /*
  * Free the checksum associated with this header. If there is no checksum, this
  * is a no-op.  Without ZFS_DEBUG the function body is empty.
  */
 static inline void
 arc_cksum_free(arc_buf_hdr_t *hdr)
 {
 #ifdef ZFS_DEBUG
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 	zio_cksum_t *cksum = hdr->b_l1hdr.b_freeze_cksum;
 	if (cksum != NULL) {
 		hdr->b_l1hdr.b_freeze_cksum = NULL;
 		kmem_free(cksum, sizeof (zio_cksum_t));
 	}
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 }
 
 /*
  * Return true iff at least one of the bufs on hdr is not compressed.
  * Encrypted buffers count as compressed.
  */
 static boolean_t
 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
 {
 	ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr));
 
 	arc_buf_t *cur = hdr->b_l1hdr.b_buf;
 	while (cur != NULL) {
 		if (!ARC_BUF_COMPRESSED(cur))
 			return (B_TRUE);
 		cur = cur->b_next;
 	}
 
 	return (B_FALSE);
 }
 
 
 /*
  * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
  * matches the checksum that is stored in the hdr. If there is no checksum,
  * or if the buf is compressed, this is a no-op.
  *
  * Without ZFS_DEBUG the function body is empty.  A mismatch panics.
  */
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
 #ifdef ZFS_DEBUG
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	zio_cksum_t zc;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
 	if (ARC_BUF_COMPRESSED(buf))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	/* b_freeze_cksum is read and compared under b_freeze_lock. */
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 
 	/* Nothing to compare against, or the header is flagged IO_ERROR. */
 	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
 
 	/* Recompute the checksum of the current data and compare. */
 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
 	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
 		panic("buffer modified while frozen!");
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 }
 
 /*
  * This function makes the assumption that data stored in the L2ARC
  * will be transformed exactly as it is in the main pool. Because of
  * this we can verify the checksum against the reading process's bp.
  *
  * Returns B_TRUE when zio_checksum_error_impl() reports no error for the
  * zio's data, B_FALSE otherwise.
  */
 static boolean_t
 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 {
 	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
 	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
 
 	/*
 	 * Block pointers always store the checksum for the logical data.
 	 * When the gang bit is set in the block pointer, its checksum covers
 	 * the reconstituted data rather than an individual gang member.
 	 * The zio pipeline has to be able to determine the checksum of each
 	 * gang constituent, so it treats the comparison differently than
 	 * what we need for l2arc blocks, and zio_checksum_error() cannot be
 	 * used directly.  We call zio_checksum_error_impl() instead, so that
 	 * the checksum is generated with the correct algorithm and accounts
 	 * for the logical I/O size and not just a gang fragment.
 	 */
 	if (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
 	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
 	    zio->io_offset, NULL) != 0)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * When ZFS_DEBUG_MODIFY is enabled, record a fletcher-2 checksum of the buf's
  * data in the hdr (b_freeze_cksum) so that later modification of the buf can
  * be detected, then apply watch protection to the buf. If the buf is
  * compressed or the hdr already carries a checksum, nothing is computed and
  * the function returns early (only uncompressed bufs are checksummed).
  */
 static void
 arc_cksum_compute(arc_buf_t *buf)
 {
 	if ((zfs_flags & ZFS_DEBUG_MODIFY) == 0)
 		return;
 
 #ifdef ZFS_DEBUG
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 	if (ARC_BUF_COMPRESSED(buf) || hdr->b_l1hdr.b_freeze_cksum != NULL) {
 		/* Nothing to do; this path also skips arc_buf_watch(). */
 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
 
 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
 	ASSERT(!ARC_BUF_COMPRESSED(buf));
 
 	zio_cksum_t *cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
 	hdr->b_l1hdr.b_freeze_cksum = cksum;
 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, cksum);
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 #endif
 	arc_buf_watch(buf);
 }
 
 #ifndef _KERNEL
 /*
  * Userland-only SIGSEGV handler: panics, reporting the faulting address taken
  * from the siginfo. The signal number and the third handler argument are
  * not used.
  */
 void
 arc_buf_sigsegv(int sig, siginfo_t *si, void *unused)
 {
 	(void) sig;
 	(void) unused;
 	panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr);
 }
 #endif
 
 /*
  * Make the buf's data readable and writable again. This only does anything
  * in userland builds (no _KERNEL) and only when arc_watch is set; in the
  * kernel it is a no-op.
  */
 static void
 arc_buf_unwatch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch) {
 		/*
 		 * Call mprotect() outside of the assertion so that the
 		 * protection change still takes place when assertions are
 		 * compiled out; only the result check is debug-only.
 		 */
 		int err = mprotect(buf->b_data, arc_buf_size(buf),
 		    PROT_READ | PROT_WRITE);
 		ASSERT0(err);
 	}
 #else
 	(void) buf;
 #endif
 }
 
 /*
  * Make the buf's data read-only so that a stray write faults. This only does
  * anything in userland builds (no _KERNEL) and only when arc_watch is set;
  * in the kernel it is a no-op.
  */
 static void
 arc_buf_watch(arc_buf_t *buf)
 {
 #ifndef _KERNEL
 	if (arc_watch) {
 		/*
 		 * Call mprotect() outside of the assertion so that the
 		 * protection change still takes place when assertions are
 		 * compiled out; only the result check is debug-only.
 		 */
 		int err = mprotect(buf->b_data, arc_buf_size(buf),
 		    PROT_READ);
 		ASSERT0(err);
 	}
 #else
 	(void) buf;
 #endif
 }
 
 /*
  * Derive the buffer contents type (data or metadata) from the hdr's flags
  * and verify that it agrees with the type stored in hdr->b_type.
  */
 static arc_buf_contents_t
 arc_buf_type(arc_buf_hdr_t *hdr)
 {
 	arc_buf_contents_t type = HDR_ISTYPE_METADATA(hdr) ?
 	    ARC_BUFC_METADATA : ARC_BUFC_DATA;
 
 	VERIFY3U(hdr->b_type, ==, type);
 	return (type);
 }
 
 /*
  * Report whether the hdr backing this buf is flagged as metadata.
  */
 boolean_t
 arc_is_metadata(arc_buf_t *buf)
 {
 	const int is_meta = (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
 
 	return (is_meta);
 }
 
 /*
  * Translate a buffer contents type into the matching hdr flag bits.
  * Panics on any type other than ARC_BUFC_DATA or ARC_BUFC_METADATA.
  */
 static uint32_t
 arc_bufc_to_flags(arc_buf_contents_t type)
 {
 	/* Normal data is represented by the absence of the metadata flag. */
 	if (type == ARC_BUFC_DATA)
 		return (0);
 
 	if (type == ARC_BUFC_METADATA)
 		return (ARC_FLAG_BUFC_METADATA);
 
 	panic("undefined ARC buffer type!");
 	return ((uint32_t)-1);
 }
 
 /*
  * Verify the buf against its saved freeze checksum and then, for
  * uncompressed bufs, discard that checksum and lift the watch protection.
  * The hdr is asserted to be anonymous with no I/O in progress.
  */
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 
 	arc_cksum_verify(buf);
 
 	/* Compressed buffers never touch b_freeze_cksum, so skip them. */
 	if (!ARC_BUF_COMPRESSED(buf)) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 	}
 }
 
 /*
  * Freeze a buf: when ZFS_DEBUG_MODIFY is enabled and the buf is not
  * compressed, have arc_cksum_compute() record its checksum. Otherwise this
  * is a no-op.
  */
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY) || ARC_BUF_COMPRESSED(buf))
 		return;
 
 	ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
 	arc_cksum_compute(buf);
 }
 
 /*
  * b_flags in an arc_buf_hdr_t must never be written directly; go through
  * arc_hdr_set_flags() and arc_hdr_clear_flags() so that updates are
  * thread-safe. Callers must either hold the hash_lock or be operating on a
  * hdr that is undiscoverable, which guarantees that no other thread is
  * racing with the update.
  *
  * This function ORs the given flags into hdr->b_flags.
  */
 static inline void
 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	hdr->b_flags = hdr->b_flags | flags;
 }
 
 /*
  * Clear the given flags in hdr->b_flags. The hdr is asserted to be either
  * empty or locked.
  */
 static inline void
 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	hdr->b_flags = hdr->b_flags & ~flags;
 }
 
 /*
  * Record the compression algorithm in the hdr's b_flags. The compression
  * bits are handled specially because bits have to be cleared and set at the
  * same time; consumers that want to change them must use this function so
  * that the flags are updated in a thread-safe manner.
  */
 static void
 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
 {
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Holes and embedded blocks always have a psize of 0, so for them we
 	 * ignore the compression of the blkptr and mark the hdr as
 	 * uncompressed. The same applies when compressed ARC is disabled.
 	 */
 	if (zfs_compressed_arc_enabled && HDR_GET_PSIZE(hdr) != 0) {
 		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
 		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
 	} else {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
 		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
 	}
 
 	HDR_SET_COMPRESS(hdr, cmp);
 	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
 }
 
 /*
  * Search the hdr's buf list for a different buf that already holds the data
  * uncompressed. If one is found, copy arc_buf_size(buf) bytes from it into
  * buf and return B_TRUE; otherwise return B_FALSE.
  */
 static boolean_t
 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t copied = B_FALSE;
 	arc_buf_t *from;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(!ARC_BUF_COMPRESSED(buf));
 
 	for (from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) {
 		/* Our own data buffer is not a usable source. */
 		if (from != buf && !ARC_BUF_COMPRESSED(from)) {
 			memcpy(buf->b_data, from->b_data, arc_buf_size(buf));
 			copied = B_TRUE;
 			break;
 		}
 	}
 
 #ifdef ZFS_DEBUG
 	/*
 	 * If no decompressed buf was found, the hdr should not carry a
 	 * checksum either (and vice versa).
 	 */
 	if (zfs_flags & ZFS_DEBUG_MODIFY)
 		EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
 #endif
 
 	return (copied);
 }
 
 /*
  * Allocates an ARC buf header that's in an evicted & L2-cached state.
  * This is used during l2arc reconstruction to make empty ARC buffers
  * which circumvent the regular disk->arc->l2arc path and instead come
  * into being in the reverse order, i.e. l2arc->arc.
  *
  * The header is taken from hdr_l2only_cache and populated from the
  * arguments: size becomes the logical size, psize the physical size, and
  * dva/birth identify the block. dev, daddr and arcs_state are stored in the
  * L2 portion of the header. The spa guid is derived from dev's vdev.
  * size must be non-zero (asserted). Returns the new header.
  */
 static arc_buf_hdr_t *
 arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev,
     dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth,
     enum zio_compress compress, uint8_t complevel, boolean_t protected,
     boolean_t prefetch, arc_state_type_t arcs_state)
 {
 	arc_buf_hdr_t	*hdr;
 
 	ASSERT(size != 0);
 	hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP);
 	hdr->b_birth = birth;
 	hdr->b_type = type;
 	/* Start from a clean flag word before any flags are ORed in. */
 	hdr->b_flags = 0;
 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR);
 	HDR_SET_LSIZE(hdr, size);
 	/* psize is set before arc_hdr_set_compress(), which reads it. */
 	HDR_SET_PSIZE(hdr, psize);
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = complevel;
 	if (protected)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 	if (prefetch)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 	hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa);
 
 	/*
 	 * NOTE(review): b_dva is assigned only after all of the flag setters
 	 * above, each of which asserts HDR_EMPTY_OR_LOCKED(); presumably the
 	 * ordering keeps the hdr "empty" for those assertions -- confirm
 	 * before reordering.
 	 */
 	hdr->b_dva = dva;
 
 	hdr->b_l2hdr.b_dev = dev;
 	hdr->b_l2hdr.b_daddr = daddr;
 	hdr->b_l2hdr.b_arcs_state = arcs_state;
 
 	return (hdr);
 }
 
 /*
  * Return the size of the block (b_pabd) stored in the arc_buf_hdr_t: the
  * physical size when the hdr is compressed and has a non-zero psize,
  * otherwise the logical size.
  */
 static uint64_t
 arc_hdr_size(arc_buf_hdr_t *hdr)
 {
 	if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
 	    HDR_GET_PSIZE(hdr) > 0)
 		return (HDR_GET_PSIZE(hdr));
 
 	ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
 	return (HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Verify the MAC of the data held in an authenticated hdr's b_pabd.
  *
  * Returns 0 when the MAC verifies (ARC_FLAG_NOAUTH is then cleared) and also
  * when the crypto layer reports ENOENT (the flag is left set). Any other
  * error is returned to the caller unchanged.
  */
 static int
 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
 {
 	int ret;
 	uint64_t csize;
 	uint64_t lsize = HDR_GET_LSIZE(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	void *cbuf = NULL;
 	abd_t *mac_abd = hdr->b_l1hdr.b_pabd;
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_AUTHENTICATED(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	/*
 	 * The MAC covers the compressed data as stored on disk. With
 	 * compressed arc disabled only the decompressed data is at hand, so
 	 * recompress it into a temporary abd for the verification. The cost
 	 * should be relatively low, since most objects in an encrypted objset
 	 * are encrypted (rather than authenticated) anyway.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 
 		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, &cbuf, lsize, hdr->b_complevel);
 		ASSERT3P(cbuf, !=, NULL);
 		ASSERT3U(csize, <=, psize);
 		mac_abd = abd_get_from_buf(cbuf, lsize);
 		abd_take_ownership_of_buf(mac_abd, B_TRUE);
 		/* Zero the tail between the compressed size and psize. */
 		abd_zero_off(mac_abd, csize, psize - csize);
 	}
 
 	/*
 	 * Authentication is best effort: we authenticate whenever the key is
 	 * available, and clear ARC_FLAG_NOAUTH if that succeeds.
 	 */
 	if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
 		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
 		ASSERT3U(lsize, ==, psize);
 		ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, mac_abd,
 		    psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 	} else {
 		ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, mac_abd, psize,
 		    hdr->b_crypt_hdr.b_mac);
 	}
 
 	if (ret == 0) {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
 	} else if (ret == ENOENT) {
 		/* Not a failure; the hdr simply stays unauthenticated. */
 		ret = 0;
 	}
 
 	/* The temporary abd exists only if we recompressed above. */
 	if (cbuf != NULL)
 		abd_free(mac_abd);
 
 	return (ret);
 }
 
 /*
  * This function will take a header that only has raw encrypted data in
  * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
  * b_l1hdr.b_pabd. If designated in the header flags, this function will
  * also decompress the data.
  *
  * Returns 0 on success. On failure the error from the crypto or
  * decompression layer is returned, b_pabd is released again and any
  * temporary decompression abd is freed.
  */
 static int
 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
 {
 	int ret;
 	abd_t *cabd = NULL;
 	void *tmp = NULL;
 	boolean_t no_crypt = B_FALSE;
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT(HDR_ENCRYPTED(hdr));
 
 	/* Allocate b_pabd, the destination for the decrypted data. */
 	arc_hdr_alloc_abd(hdr, 0);
 
 	ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
 	    B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
 	    hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
 	    hdr->b_crypt_hdr.b_rabd, &no_crypt);
 	if (ret != 0)
 		goto error;
 
 	/* The crypto layer did no work; copy the raw data over as-is. */
 	if (no_crypt) {
 		abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
 		    HDR_GET_PSIZE(hdr));
 	}
 
 	/*
 	 * If this header has disabled arc compression but the b_pabd is
 	 * compressed after decrypting it, we need to decompress the newly
 	 * decrypted data.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		/*
 		 * We want to make sure that we are correctly honoring the
 		 * zfs_abd_scatter_enabled setting, so we allocate an abd here
 		 * and then loan a buffer from it, rather than allocating a
 		 * linear buffer and wrapping it in an abd later.
 		 */
 		cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0);
 		tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
 
 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 		if (ret != 0) {
 			abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
 			goto error;
 		}
 
 		/* Swap the decompressed abd in for the compressed one. */
 		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = cabd;
 	}
 
 	return (0);
 
 error:
 	arc_hdr_free_abd(hdr, B_FALSE);
 	/*
 	 * cabd is an abd obtained from arc_get_data_abd(), so it has to be
 	 * released with the matching arc_free_data_abd() rather than with
 	 * arc_free_data_buf(), which expects a raw data buffer.
 	 */
 	if (cabd != NULL)
 		arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
 
 	return (ret);
 }
 
 /*
  * Called from arc_buf_fill() to get the header's plaintext abd (b_pabd)
  * ready for use: protected data is authenticated if that is still pending
  * and was requested, and encrypted data that only exists in raw form is
  * decrypted into b_pabd.
  *
  * If hash_lock is non-NULL it is taken for the duration of the call.
  * Returns 0 on success or the error from the authenticate/decrypt step.
  */
 static int
 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
     const zbookmark_phys_t *zb, boolean_t noauth)
 {
 	int ret = 0;
 
 	ASSERT(HDR_PROTECTED(hdr));
 
 	if (hash_lock != NULL)
 		mutex_enter(hash_lock);
 
 	if (HDR_NOAUTH(hdr) && !noauth) {
 		/*
 		 * Authenticated data was asked for, but ours has not been
 		 * authenticated yet. Verify the MAC now if we can.
 		 */
 		ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
 	} else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
 		/*
 		 * Only the encrypted copy is present while the unencrypted
 		 * one was requested; decrypt now and keep the result in the
 		 * header for future use.
 		 */
 		ret = arc_hdr_decrypt(hdr, spa, zb);
 	}
 
 	if (ret == 0)
 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	if (hash_lock != NULL)
 		mutex_exit(hash_lock);
 
 	return (ret);
 }
 
 /*
  * Used by the dbuf code to decrypt bonus buffers in place. The dbuf code has
  * no locking of its own for decrypting a shared dnode block, so the hash
  * lock is relied upon to guard against concurrent arc_buf_fill() calls; this
  * function asserts that the hdr is empty or locked.
  *
  * Copies the dnode bonus data from b_pabd into the buf and marks the buf as
  * neither encrypted nor compressed.
  */
 static void
 arc_buf_untransform_in_place(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(HDR_ENCRYPTED(hdr));
 	ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
 	    arc_buf_size(buf));
 	buf->b_flags &= ~(ARC_BUF_FLAG_ENCRYPTED | ARC_BUF_FLAG_COMPRESSED);
 }
 
 /*
  * Given a buf that has a data buffer attached to it, this function will
  * efficiently fill the buf with data of the specified compression setting from
  * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
  * are already sharing a data buf, no copy is performed.
  *
  * If the buf is marked as compressed but uncompressed data was requested, this
  * will allocate a new data buffer for the buf, remove that flag, and fill the
  * buf with uncompressed data. You can't request a compressed buf on a hdr with
  * uncompressed data, and (since we haven't added support for it yet) if you
  * want compressed data your buf must already be marked as compressed and have
  * the correct-sized data buffer.
  *
  * Returns 0 on success. Errors from arc_fill_hdr_crypt() are passed through,
  * and a decompression failure is reported as EIO; except for the EACCES /
  * ARC_FILL_IN_PLACE case, a failure also sets ARC_FLAG_IO_ERROR on the hdr.
  */
 static int
 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
     arc_fill_flags_t flags)
 {
 	int error = 0;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	boolean_t hdr_compressed =
 	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
 	boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
 	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
 	/*
 	 * With ARC_FILL_LOCKED no hash lock is taken by this function
 	 * (presumably because the caller already holds it -- confirm).
 	 */
 	kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
 	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
 	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
 	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
 	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
 	IMPLY(encrypted, !arc_buf_is_shared(buf));
 
 	/*
 	 * If the caller wanted encrypted data we just need to copy it from
 	 * b_rabd and potentially byteswap it. We won't be able to do any
 	 * further transforms on it.
 	 */
 	if (encrypted) {
 		ASSERT(HDR_HAS_RABD(hdr));
 		abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
 		    HDR_GET_PSIZE(hdr));
 		goto byteswap;
 	}
 
 	/*
 	 * Adjust encrypted and authenticated headers to accommodate
 	 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
 	 * allowed to fail decryption due to keys not being loaded
 	 * without being marked as an IO error.
 	 */
 	if (HDR_PROTECTED(hdr)) {
 		error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
 		    zb, !!(flags & ARC_FILL_NOAUTH));
 		if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
 			return (error);
 		} else if (error != 0) {
 			/* Flag the hdr under the hash lock (if we manage it). */
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
 			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			return (error);
 		}
 	}
 
 	/*
 	 * There is a special case here for dnode blocks which are
 	 * decrypting their bonus buffers. These blocks may request to
 	 * be decrypted in-place. This is necessary because there may
 	 * be many dnodes pointing into this buffer and there is
 	 * currently no method to synchronize replacing the backing
 	 * b_data buffer and updating all of the pointers. Here we use
 	 * the hash lock to ensure there are no races. If the need
 	 * arises for other types to be decrypted in-place, they must
 	 * add handling here as well.
 	 */
 	if ((flags & ARC_FILL_IN_PLACE) != 0) {
 		ASSERT(!hdr_compressed);
 		ASSERT(!compressed);
 		ASSERT(!encrypted);
 
 		if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
 			ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
 
 			if (hash_lock != NULL)
 				mutex_enter(hash_lock);
 			arc_buf_untransform_in_place(buf);
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 
 			/* Compute the hdr's checksum if necessary */
 			arc_cksum_compute(buf);
 		}
 
 		/* In-place requests never reach the byteswap step below. */
 		return (0);
 	}
 
 	if (hdr_compressed == compressed) {
 		/*
 		 * The hdr's data is already in the requested form: a shared
 		 * buf needs nothing, any other buf gets a plain copy.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(arc_buf_is_shared(buf));
 		} else {
 			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
 			    arc_buf_size(buf));
 		}
 	} else {
 		/* The only supported mismatch: hdr compressed, buf not. */
 		ASSERT(hdr_compressed);
 		ASSERT(!compressed);
 
 		/*
 		 * If the buf is sharing its data with the hdr, unlink it and
 		 * allocate a new data buffer for the buf.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERT(ARC_BUF_COMPRESSED(buf));
 
 			/* We need to give the buf its own b_data */
 			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 			buf->b_data =
 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 
 			/* Previously overhead was 0; just add new overhead */
 			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
 		} else if (ARC_BUF_COMPRESSED(buf)) {
 			ASSERT(!arc_buf_is_shared(buf));
 
 			/* We need to reallocate the buf's b_data */
 			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
 			    buf);
 			buf->b_data =
 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
 
 			/* We increased the size of b_data; update overhead */
 			ARCSTAT_INCR(arcstat_overhead_size,
 			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
 		}
 
 		/*
 		 * Regardless of the buf's previous compression settings, it
 		 * should not be compressed at the end of this function.
 		 */
 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 
 		/*
 		 * Try copying the data from another buf which already has a
 		 * decompressed version. If that's not possible, it's time to
 		 * bite the bullet and decompress the data from the hdr.
 		 */
 		if (arc_buf_try_copy_decompressed_data(buf)) {
 			/* Skip byteswapping and checksumming (already done) */
 			return (0);
 		} else {
 			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 			    hdr->b_l1hdr.b_pabd, buf->b_data,
 			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr),
 			    &hdr->b_complevel);
 
 			/*
 			 * Absent hardware errors or software bugs, this should
 			 * be impossible, but log it anyway so we can debug it.
 			 */
 			if (error != 0) {
 				zfs_dbgmsg(
 				    "hdr %px, compress %d, psize %d, lsize %d",
 				    hdr, arc_hdr_get_compress(hdr),
 				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
 				if (hash_lock != NULL)
 					mutex_enter(hash_lock);
 				arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 				if (hash_lock != NULL)
 					mutex_exit(hash_lock);
 				/* The decompressor's own code is not returned. */
 				return (SET_ERROR(EIO));
 			}
 		}
 	}
 
 byteswap:
 	/* Byteswap the buf's data if necessary */
 	if (bswap != DMU_BSWAP_NUMFUNCS) {
 		ASSERT(!HDR_SHARED_DATA(hdr));
 		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
 		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
 	}
 
 	/* Compute the hdr's checksum if necessary */
 	arc_cksum_compute(buf);
 
 	return (0);
 }
 
 /*
  * Fill the buf with untransformed data via arc_buf_fill(), optionally in
  * place. When this is used to decrypt an encrypted buffer or to verify an
  * authenticated one, the key must be loaded and a mapping must have been
  * made available in the keystore via spa_keystore_create_mapping() or one of
  * its callers.
  *
  * Returns the result of arc_buf_fill(), except that ECKSUM is logged,
  * reported as an authentication ereport, and converted to EIO.
  */
 int
 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
     boolean_t in_place)
 {
 	arc_fill_flags_t flags = in_place ? ARC_FILL_IN_PLACE : 0;
 	int ret = arc_buf_fill(buf, spa, zb, flags);
 
 	if (ret != ECKSUM)
 		return (ret);
 
 	/*
 	 * Authentication and decryption errors become EIO (with an ereport)
 	 * before they leave the ARC.
 	 */
 	ret = SET_ERROR(EIO);
 	spa_log_error(spa, zb, &buf->b_hdr->b_birth);
 	(void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
 	    spa, NULL, zb, NULL, 0);
 
 	return (ret);
 }
 
 /*
  * Add this hdr's space to the evictable-size refcount (arcs_esize) of the
  * given state. The hdr's own data and each arc buf are accounted separately,
  * so that they can also be added and removed separately.
  */
 static void
 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (GHOST_STATE(state)) {
 		/* Ghost hdrs hold no data; account the logical size. */
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    HDR_GET_LSIZE(hdr), hdr);
 	} else {
 		arc_buf_t *buf;
 
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			(void) zfs_refcount_add_many(&state->arcs_esize[type],
 			    arc_hdr_size(hdr), hdr);
 		}
 		if (HDR_HAS_RABD(hdr)) {
 			(void) zfs_refcount_add_many(&state->arcs_esize[type],
 			    HDR_GET_PSIZE(hdr), hdr);
 		}
 
 		/* Shared bufs are skipped; all others are counted. */
 		buf = hdr->b_l1hdr.b_buf;
 		while (buf != NULL) {
 			if (!ARC_BUF_SHARED(buf)) {
 				(void) zfs_refcount_add_many(
 				    &state->arcs_esize[type],
 				    arc_buf_size(buf), buf);
 			}
 			buf = buf->b_next;
 		}
 	}
 }
 
 /*
  * Remove this hdr's space from the evictable-size refcount (arcs_esize) of
  * the given state. The hdr's own data and each arc buf are accounted
  * separately, so that they can also be added and removed separately.
  */
 static void
 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
 {
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	if (GHOST_STATE(state)) {
 		/* Ghost hdrs hold no data; the logical size was accounted. */
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    HDR_GET_LSIZE(hdr), hdr);
 	} else {
 		arc_buf_t *buf;
 
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type],
 			    arc_hdr_size(hdr), hdr);
 		}
 		if (HDR_HAS_RABD(hdr)) {
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type],
 			    HDR_GET_PSIZE(hdr), hdr);
 		}
 
 		/* Shared bufs are skipped; all others are uncounted. */
 		buf = hdr->b_l1hdr.b_buf;
 		while (buf != NULL) {
 			if (!ARC_BUF_SHARED(buf)) {
 				(void) zfs_refcount_remove_many(
 				    &state->arcs_esize[type],
 				    arc_buf_size(buf), buf);
 			}
 			buf = buf->b_next;
 		}
 	}
 }
 
 /*
  * Take a reference on this hdr on behalf of tag, indicating that someone is
  * actively using the memory. On the 0 -> 1 transition the hdr is taken off
  * its arc_state_t list (and its evictable space is uncounted), which makes
  * it non-evictable. Anonymous and L2-only hdrs are on no such list.
  */
 static void
 add_reference(arc_buf_hdr_t *hdr, const void *tag)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) {
 		ASSERT(state == arc_anon);
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	}
 
 	/* Only the first reference changes evictability. */
 	if (zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) != 1)
 		return;
 
 	/* We don't use the L2-only state list. */
 	if (state == arc_anon || state == arc_l2c_only)
 		return;
 
 	multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr);
 	arc_evictable_space_decrement(hdr, state);
 }
 
 /*
  * Drop tag's reference on this hdr and return the remaining count. When the
  * count goes from 1 to 0, an anonymous hdr is destroyed, as is an uncached
  * hdr that is not a prefetch (after being moved to the anonymous state).
  * Any other hdr is put on its arc_state_t list, making it eligible for
  * eviction.
  */
 static int
 remove_reference(arc_buf_hdr_t *hdr, const void *tag)
 {
 	int cnt;
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(!GHOST_STATE(state));	/* arc_l2c_only counts as a ghost. */
 
 	cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 	if (cnt != 0)
 		return (cnt);
 
 	if (state == arc_anon ||
 	    (state == arc_uncached && !HDR_PREFETCH(hdr))) {
 		/* Uncached hdrs become anonymous before they are destroyed. */
 		if (state != arc_anon)
 			arc_change_state(arc_anon, hdr);
 		arc_hdr_destroy(hdr);
 		return (0);
 	}
 
 	multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
 	arc_evictable_space_increment(hdr, state);
 	return (0);
 }
 
 /*
  * Fill *abi with detailed information about a specific arc buffer. *abi is
  * zeroed first and is left that way if the buf has no hdr.
  *
  * The state_index argument is accepted but currently ignored by this
  * implementation; no list position is calculated.
  */
 void
 arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index)
 {
 	(void) state_index;
 	arc_buf_hdr_t *hdr = ab->b_hdr;
 	arc_state_t *state = NULL;
 
 	memset(abi, 0, sizeof (arc_buf_info_t));
 
 	if (hdr == NULL)
 		return;
 
 	abi->abi_flags = hdr->b_flags;
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		l1arc_buf_hdr_t *l1hdr = &hdr->b_l1hdr;
 		arc_buf_t *buf = l1hdr->b_buf;
 
 		state = l1hdr->b_state;
 
 		/* Count the bufs hanging off of this hdr. */
 		abi->abi_bufcnt = 0;
 		while (buf != NULL) {
 			abi->abi_bufcnt++;
 			buf = buf->b_next;
 		}
 		abi->abi_access = l1hdr->b_arc_access;
 		abi->abi_mru_hits = l1hdr->b_mru_hits;
 		abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits;
 		abi->abi_mfu_hits = l1hdr->b_mfu_hits;
 		abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits;
 		abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt);
 	}
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
 
 		abi->abi_l2arc_dattr = l2hdr->b_daddr;
 		abi->abi_l2arc_hits = l2hdr->b_hits;
 	}
 
 	/* Without an L1 hdr there is no state; report anonymous. */
 	abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON;
 	abi->abi_state_contents = arc_buf_type(hdr);
 	abi->abi_size = arc_hdr_size(hdr);
 }
 
 /*
  * Move the supplied buffer to the indicated state. The hash lock
  * for the buffer must be held by the caller.
  *
  * Besides recording the new state in the hdr, this moves an unreferenced
  * hdr between the per-state eviction lists, removes the hdr from the hash
  * table when it becomes anonymous, transfers the hdr's space between the
  * old and new states' arcs_size refcounts, and updates the per-state L2ARC
  * statistics for hdrs that have an L2 portion.
  */
 static void
 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
 {
 	arc_state_t *old_state;
 	int64_t refcnt;
 	boolean_t update_old, update_new;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 
 	/*
 	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
 	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
 	 * L1 hdr doesn't always exist when we change state to arc_anon before
 	 * destroying a header, in which case reallocating to add the L1 hdr is
 	 * pointless.
 	 */
 	if (HDR_HAS_L1HDR(hdr)) {
 		old_state = hdr->b_l1hdr.b_state;
 		refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
 		/* Size accounting is only needed if the hdr holds any data. */
 		update_old = (hdr->b_l1hdr.b_buf != NULL ||
 		    hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 
 		IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL);
 		IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL);
 		IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL ||
 		    ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
 	} else {
 		/* No L1 hdr: treat the hdr as unreferenced and L2-only. */
 		old_state = arc_l2c_only;
 		refcnt = 0;
 		update_old = B_FALSE;
 	}
 	/* Ghost states are always accounted (by the hdr's logical size). */
 	update_new = update_old;
 	if (GHOST_STATE(old_state))
 		update_old = B_TRUE;
 	if (GHOST_STATE(new_state))
 		update_new = B_TRUE;
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT3P(new_state, !=, old_state);
 
 	/*
 	 * If this buffer is evictable, transfer it from the
 	 * old state list to the new state list.
 	 */
 	if (refcnt == 0) {
 		if (old_state != arc_anon && old_state != arc_l2c_only) {
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			/* remove_reference() saves on insert. */
 			if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 				multilist_remove(&old_state->arcs_list[type],
 				    hdr);
 				arc_evictable_space_decrement(hdr, old_state);
 			}
 		}
 		if (new_state != arc_anon && new_state != arc_l2c_only) {
 			/*
 			 * An L1 header always exists here, since if we're
 			 * moving to some L1-cached state (i.e. not l2c_only or
 			 * anonymous), we realloc the header to add an L1hdr
 			 * beforehand.
 			 */
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			multilist_insert(&new_state->arcs_list[type], hdr);
 			arc_evictable_space_increment(hdr, new_state);
 		}
 	}
 
 	ASSERT(!HDR_EMPTY(hdr));
 	/* Anonymous hdrs are not kept in the hash table. */
 	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
 		buf_hash_remove(hdr);
 
 	/* adjust state sizes (ignore arc_l2c_only) */
 
 	if (update_new && new_state != arc_l2c_only) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(new_state)) {
 
 			/*
 			 * When moving a header to a ghost state, we first
 			 * remove all arc buffers. Thus, we'll have no arc
 			 * buffer to use for the reference. As a result, we
 			 * use the arc header pointer for the reference.
 			 */
 			(void) zfs_refcount_add_many(
 			    &new_state->arcs_size[type],
 			    HDR_GET_LSIZE(hdr), hdr);
 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 		} else {
 
 			/*
 			 * Each individual buffer holds a unique reference,
 			 * thus we must add each of these references one
 			 * at a time.
 			 */
 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 			    buf = buf->b_next) {
 
 				/*
 				 * When the arc_buf_t is sharing the data
 				 * block with the hdr, the owner of the
 				 * reference belongs to the hdr. Only
 				 * add to the refcount if the arc_buf_t is
 				 * not shared.
 				 */
 				if (ARC_BUF_SHARED(buf))
 					continue;
 
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    arc_buf_size(buf), buf);
 			}
 
 			/* The hdr's own plaintext and raw copies, if any. */
 			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    arc_hdr_size(hdr), hdr);
 			}
 
 			if (HDR_HAS_RABD(hdr)) {
 				(void) zfs_refcount_add_many(
 				    &new_state->arcs_size[type],
 				    HDR_GET_PSIZE(hdr), hdr);
 			}
 		}
 	}
 
 	if (update_old && old_state != arc_l2c_only) {
 		ASSERT(HDR_HAS_L1HDR(hdr));
 		if (GHOST_STATE(old_state)) {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 
 			/*
 			 * When moving a header off of a ghost state,
 			 * the header will not contain any arc buffers.
 			 * We use the arc header pointer for the reference
 			 * which is exactly what we did when we put the
 			 * header on the ghost state.
 			 */
 
 			(void) zfs_refcount_remove_many(
 			    &old_state->arcs_size[type],
 			    HDR_GET_LSIZE(hdr), hdr);
 		} else {
 
 			/*
 			 * Each individual buffer holds a unique reference,
 			 * thus we must remove each of these references one
 			 * at a time.
 			 */
 			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
 			    buf = buf->b_next) {
 
 				/*
 				 * When the arc_buf_t is sharing the data
 				 * block with the hdr, the owner of the
 				 * reference belongs to the hdr. Only
 				 * remove from the refcount if the arc_buf_t
 				 * is not shared.
 				 */
 				if (ARC_BUF_SHARED(buf))
 					continue;
 
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    arc_buf_size(buf), buf);
 			}
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 
 			/* The hdr's own plaintext and raw copies, if any. */
 			if (hdr->b_l1hdr.b_pabd != NULL) {
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    arc_hdr_size(hdr), hdr);
 			}
 
 			if (HDR_HAS_RABD(hdr)) {
 				(void) zfs_refcount_remove_many(
 				    &old_state->arcs_size[type],
 				    HDR_GET_PSIZE(hdr), hdr);
 			}
 		}
 	}
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		hdr->b_l1hdr.b_state = new_state;
 
 		/*
 		 * Keep the per-state L2ARC statistics in step: drop the hdr
 		 * from its old state's counters, record the new state in the
 		 * L2 portion, and count it again under the new state.
 		 */
 		if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) {
 			l2arc_hdr_arcstats_decrement_state(hdr);
 			hdr->b_l2hdr.b_arcs_state = new_state->arcs_state;
 			l2arc_hdr_arcstats_increment_state(hdr);
 		}
 	}
 }
 
 /*
  * Charge `space' bytes to the ARC.  The per-type size stat is bumped, every
  * type other than plain data and ABD chunk waste is also charged to
  * arcstat_meta_used, and the total ARC size always grows by `space'.
  */
 void
 arc_space_consume(uint64_t space, arc_space_type_t type)
 {
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	const boolean_t counts_as_meta = !(type == ARC_SPACE_DATA ||
 	    type == ARC_SPACE_ABD_CHUNK_WASTE);
 
 	switch (type) {
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_metadata_size, space);
 		break;
 	case ARC_SPACE_BONUS:
 		ARCSTAT_INCR(arcstat_bonus_size, space);
 		break;
 	case ARC_SPACE_DNODE:
 		ARCSTAT_INCR(arcstat_dnode_size, space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		aggsum_add(&arc_sums.arcstat_l2_hdr_size, space);
 		break;
 	case ARC_SPACE_ABD_CHUNK_WASTE:
 		/*
 		 * This counts the waste of every scatter ABD, including
 		 * ones the ARC did not allocate.  Other users are very
 		 * short-lived, so nearly all scatter ABD's belong to the
 		 * ARC anyway.
 		 */
 		ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space);
 		break;
 	default:
 		/* No per-type stat for anything else. */
 		break;
 	}
 
 	if (counts_as_meta)
 		ARCSTAT_INCR(arcstat_meta_used, space);
 
 	aggsum_add(&arc_sums.arcstat_size, space);
 }
 
 /*
  * Give `space' bytes back to the ARC accounting.  Each stat that
  * arc_space_consume() bumps for `type' is decremented by the same amount;
  * the total ARC size is asserted to be at least `space' before it is
  * reduced.
  */
 void
 arc_space_return(uint64_t space, arc_space_type_t type)
 {
 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
 
 	const boolean_t counts_as_meta = !(type == ARC_SPACE_DATA ||
 	    type == ARC_SPACE_ABD_CHUNK_WASTE);
 
 	switch (type) {
 	case ARC_SPACE_DATA:
 		ARCSTAT_INCR(arcstat_data_size, -space);
 		break;
 	case ARC_SPACE_META:
 		ARCSTAT_INCR(arcstat_metadata_size, -space);
 		break;
 	case ARC_SPACE_BONUS:
 		ARCSTAT_INCR(arcstat_bonus_size, -space);
 		break;
 	case ARC_SPACE_DNODE:
 		ARCSTAT_INCR(arcstat_dnode_size, -space);
 		break;
 	case ARC_SPACE_DBUF:
 		ARCSTAT_INCR(arcstat_dbuf_size, -space);
 		break;
 	case ARC_SPACE_HDRS:
 		ARCSTAT_INCR(arcstat_hdr_size, -space);
 		break;
 	case ARC_SPACE_L2HDRS:
 		aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space);
 		break;
 	case ARC_SPACE_ABD_CHUNK_WASTE:
 		ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space);
 		break;
 	default:
 		/* No per-type stat for anything else. */
 		break;
 	}
 
 	if (counts_as_meta)
 		ARCSTAT_INCR(arcstat_meta_used, -space);
 
 	ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0);
 	aggsum_add(&arc_sums.arcstat_size, -space);
 }
 
 /*
  * Report whether `buf' is allowed to share its b_data buffer with the
  * b_pabd of `hdr'.  `buf' must already point at `hdr'.
  *
  * Sharing is permitted only when all of the following hold:
  *  - the buf is not encrypted;
  *  - the buf and the hdr agree on being compressed or uncompressed;
  *  - the hdr needs no byteswap;
  *  - the hdr is not already sharing its data;
  *  - the buf is compressed, or it is the last buf on the hdr's list.
  *
  * The final rule keeps the invariant that a shared uncompressed buf is
  * always the last entry of the hdr's b_buf list.  A compressed buf that
  * was allocated first does not break this: the hdr must then be
  * compressed too, so no shared uncompressed buf can exist.  One might
  * expect the other rules to make the final one redundant, but in the rare
  * L2ARC write race described in arc_buf_alloc_impl() an uncompressed buf
  * can exist that is shareable now yet was not when it was allocated.
  * Instead of sharing a newer uncompressed buf and then reshuffling the
  * list to put it last, sharing is simply refused unless the new buf is
  * the first one added.
  */
 static boolean_t
 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT3P(buf->b_hdr, ==, hdr);
 
 	const boolean_t hdr_is_compressed =
 	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	const boolean_t buf_is_compressed = (ARC_BUF_COMPRESSED(buf) != 0);
 
 	if (ARC_BUF_ENCRYPTED(buf))
 		return (B_FALSE);
 
 	if (buf_is_compressed != hdr_is_compressed)
 		return (B_FALSE);
 
 	if (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS)
 		return (B_FALSE);
 
 	if (HDR_SHARED_DATA(hdr))
 		return (B_FALSE);
 
 	return (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf));
 }
 
 /*
  * Allocate a buf for this hdr. If you care about the data that's in the hdr,
  * or if you want a compressed buffer, pass those flags in. Returns 0 if the
  * copy was made successfully, or an error code otherwise.
  *
  *  hdr        - header to attach the new buf to; must have an L1 header.
  *  spa, zb    - only handed to arc_buf_fill(); zb must be non-NULL when
  *               fill is set.
  *  tag        - holder passed to add_reference() for the hdr.
  *  encrypted  - request an encrypted buf; implies compressed.
  *  compressed - request a compressed buf; honored only if the hdr is
  *               actually compressed (or the buf is encrypted).
  *  noauth     - adds ARC_FILL_NOAUTH to the fill flags; must not be
  *               combined with encrypted.
  *  fill       - populate the buf from the hdr via arc_buf_fill().
  *  ret        - out parameter; *ret must be NULL on entry.
  *
  * Note that *ret is assigned and the buf is linked onto the hdr's list
  * before arc_buf_fill() runs, so both remain in place when a non-zero
  * error is returned.
  */
 static int
 arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
     const void *tag, boolean_t encrypted, boolean_t compressed,
     boolean_t noauth, boolean_t fill, arc_buf_t **ret)
 {
 	arc_buf_t *buf;
 	arc_fill_flags_t flags = ARC_FILL_LOCKED;
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
 	    hdr->b_type == ARC_BUFC_METADATA);
 	ASSERT3P(ret, !=, NULL);
 	ASSERT3P(*ret, ==, NULL);
 	IMPLY(encrypted, compressed);
 
 	/*
 	 * b_next is pointed at the current list head here, but the hdr's
 	 * b_buf is only updated further below, after b_data is set up.
 	 */
 	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 	buf->b_hdr = hdr;
 	buf->b_data = NULL;
 	buf->b_next = hdr->b_l1hdr.b_buf;
 	buf->b_flags = 0;
 
 	add_reference(hdr, tag);
 
 	/*
 	 * We're about to change the hdr's b_flags. We must either
 	 * hold the hash_lock or be undiscoverable.
 	 */
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Only honor requests for compressed bufs if the hdr is actually
 	 * compressed. This must be overridden if the buffer is encrypted since
 	 * encrypted buffers cannot be decompressed.
 	 */
 	if (encrypted) {
 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 		buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
 		flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
 	} else if (compressed &&
 	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
 		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
 		flags |= ARC_FILL_COMPRESSED;
 	}
 
 	if (noauth) {
 		ASSERT0(encrypted);
 		flags |= ARC_FILL_NOAUTH;
 	}
 
 	/*
 	 * If the hdr's data can be shared then we share the data buffer and
 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
 	 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new
 	 * buffer to store the buf's data.
 	 *
 	 * There are two additional restrictions here because we're sharing
 	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
 	 * actively involved in an L2ARC write, because if this buf is used by
 	 * an arc_write() then the hdr's data buffer will be released when the
 	 * write completes, even though the L2ARC write might still be using it.
 	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
 	 * need to be ABD-aware.  It must be allocated via
 	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
 	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
 	 * page" buffers because the ABD code needs to handle freeing them
 	 * specially.
 	 */
 	boolean_t can_share = arc_can_share(hdr, buf) &&
 	    !HDR_L2_WRITING(hdr) &&
 	    hdr->b_l1hdr.b_pabd != NULL &&
 	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
 	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
 
 	/* Set up b_data and sharing */
 	if (can_share) {
 		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
 		buf->b_flags |= ARC_BUF_FLAG_SHARED;
 		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	} else {
 		/* A private copy is accounted as overhead. */
 		buf->b_data =
 		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
 		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
 	}
 	VERIFY3P(buf->b_data, !=, NULL);
 
 	/* Publish the buf as the new head of the hdr's buf list. */
 	hdr->b_l1hdr.b_buf = buf;
 
 	/*
 	 * If the user wants the data from the hdr, we need to either copy or
 	 * decompress the data.
 	 */
 	if (fill) {
 		ASSERT3P(zb, !=, NULL);
 		return (arc_buf_fill(buf, spa, zb, flags));
 	}
 
 	return (0);
 }
 
 /* Reference tag held on a hdr's refcount while one of its bufs is on loan. */
 static const char *arc_onloan_tag = "onloan";
 
 /*
  * Atomically adjust the global arc_loaned_bytes counter by `delta'.
  * Callers pass a positive value when loaning a buffer and a negative one
  * when a loaned buffer is returned.
  */
 static inline void
 arc_loaned_bytes_update(int64_t delta)
 {
 	atomic_add_64(&arc_loaned_bytes, delta);
 
 	/*
 	 * assert that it did not wrap around; adding zero is used to read
 	 * the counter back atomically, and only happens when ASSERTs are
 	 * compiled in.
 	 */
 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
 }
 
 /*
  * Hand out an anonymous arc buffer on loan.  Until a loaned buffer is
  * "returned", arc_tempreserve_space() does not count it as in-flight data.
  * A loaned buffer has to be returned to the arc before the DMU may use it
  * or before it may be freed.
  */
 arc_buf_t *
 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
 {
 	arc_buf_contents_t contents = ARC_BUFC_DATA;
 	arc_buf_t *loaned;
 
 	if (is_metadata)
 		contents = ARC_BUFC_METADATA;
 
 	/* The loan is held under the dedicated onloan tag. */
 	loaned = arc_alloc_buf(spa, arc_onloan_tag, contents, size);
 	arc_loaned_bytes_update(arc_buf_size(loaned));
 
 	return (loaned);
 }
 
 /*
  * Compressed variant of arc_loan_buf(): allocate a compressed anonymous
  * buffer under the onloan tag and add its size to the loaned byte count.
  */
 arc_buf_t *
 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *loaned;
 
 	loaned = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize,
 	    compression_type, complevel);
 	arc_loaned_bytes_update(arc_buf_size(loaned));
 
 	return (loaned);
 }
 
 /*
  * Raw (encrypted) variant of arc_loan_buf(): allocate a raw anonymous
  * buffer under the onloan tag and add its size to the loaned byte count.
  *
  * The loaned bytes are charged through arc_loaned_bytes_update() using
  * arc_buf_size(buf), exactly as the other loan functions do, so that the
  * amount added here is the same quantity arc_return_buf() later subtracts
  * and the wrap-around assertion is applied to raw loans as well.
  */
 arc_buf_t *
 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
 	    byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
 	    complevel);
 
 	arc_loaned_bytes_update(arc_buf_size(buf));
 
 	return (buf);
 }
 
 
 /*
  * Return a loaned arc buffer to the arc.
  *
  * The hdr reference held under arc_onloan_tag is replaced by one held
  * under `tag', and the buf's size is subtracted from the loaned byte
  * count.  The new reference is added before the loan reference is removed.
  * NOTE(review): presumably so the refcount never transiently drops to
  * zero -- confirm.
  */
 void
 arc_return_buf(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 
 	arc_loaned_bytes_update(-arc_buf_size(buf));
 }
 
 /*
  * Detach an arc_buf from a dbuf (tag)
  *
  * This is the inverse of arc_return_buf(): the hdr reference held under
  * `tag' is replaced by one held under arc_onloan_tag, and the buf's size
  * is added to the loaned byte count.  The loan reference is added before
  * the caller's reference is removed.
  */
 void
 arc_loan_inuse_buf(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(buf->b_data, !=, NULL);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
 
 	arc_loaned_bytes_update(arc_buf_size(buf));
 }
 
 /*
  * Queue `abd' (with its size and contents type) at the head of the
  * l2arc_free_on_write list, under l2arc_free_on_write_mtx.
  */
 static void
 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
 {
 	l2arc_data_free_t *entry;
 
 	entry = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
 	entry->l2df_abd = abd;
 	entry->l2df_size = size;
 	entry->l2df_type = type;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	list_insert_head(l2arc_free_on_write, entry);
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * Drop the state and space accounting for one of the hdr's ABDs and queue
  * that ABD on the free-on-write list rather than freeing it here.  With
  * free_rdata set the raw ABD (b_rabd, PSIZE bytes) is queued; otherwise the
  * physical ABD (b_pabd, arc_hdr_size() bytes) is.
  */
 static void
 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
 {
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	uint64_t size;
 
 	if (free_rdata)
 		size = HDR_GET_PSIZE(hdr);
 	else
 		size = arc_hdr_size(hdr);
 
 	/* protected by hash lock, if in the hash table */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT(state != arc_anon && state != arc_l2c_only);
 
 		/* On a state list: the evictable size shrinks as well. */
 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
 		    size, hdr);
 	}
 	(void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr);
 
 	if (type != ARC_BUFC_METADATA) {
 		ASSERT(type == ARC_BUFC_DATA);
 		arc_space_return(size, ARC_SPACE_DATA);
 	} else {
 		arc_space_return(size, ARC_SPACE_META);
 	}
 
 	abd_t *deferred = free_rdata ?
 	    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
 	l2arc_free_abd_on_write(deferred, size, type);
 }
 
 /*
  * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
  * data buffer, we transfer the refcount ownership to the hdr and update
  * the appropriate kstats.
  *
  * On entry the hdr must have no b_pabd, the buf must not be encrypted and
  * arc_can_share() must hold.  On return the hdr's b_pabd is an ABD wrapped
  * around buf->b_data, and both the hdr and the buf are flagged as shared.
  */
 static void
 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(arc_can_share(hdr, buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * Start sharing the data buffer. We transfer the
 	 * refcount ownership to the hdr since it always owns
 	 * the refcount whenever an arc_buf_t is shared.
 	 */
 	zfs_refcount_transfer_ownership_many(
 	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
 	    arc_hdr_size(hdr), buf, hdr);
 	/* Wrap the buf's existing memory in an ABD owned by the hdr. */
 	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
 	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
 	    HDR_ISTYPE_METADATA(hdr));
 	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
 	buf->b_flags |= ARC_BUF_FLAG_SHARED;
 
 	/*
 	 * Since we've transferred ownership to the hdr we need
 	 * to increment its compressed and uncompressed kstats and
 	 * decrement the overhead size.
 	 */
 	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
 }
 
 /*
  * Stop sharing the data buffer between the hdr and the buf.  Refcount
  * ownership moves from the hdr back to the buf, the hdr's wrapping ABD is
  * released and freed (b_pabd becomes NULL), the shared flags are cleared
  * on both sides, and the kstats adjusted by arc_share_buf() are reversed.
  */
 static void
 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(arc_buf_is_shared(buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/*
 	 * We are no longer sharing this buffer so we need
 	 * to transfer its ownership to the rightful owner.
 	 */
 	zfs_refcount_transfer_ownership_many(
 	    &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)],
 	    arc_hdr_size(hdr), hdr, buf);
 	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 	/* Ownership of the memory is released before the ABD is freed. */
 	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
 	abd_free(hdr->b_l1hdr.b_pabd);
 	hdr->b_l1hdr.b_pabd = NULL;
 	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
 
 	/*
 	 * Since the buffer is no longer shared between
 	 * the arc buf and the hdr, count it as overhead.
 	 */
 	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
 	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
 }
 
 /*
  * Remove an arc_buf_t from the hdr's buf list and return the last
  * arc_buf_t on the list. If no buffers remain on the list then return
  * NULL.
  *
  * The list is walked once from the head; buf->b_next is cleared on the
  * way out.  The buf itself is not freed here.
  */
 static arc_buf_t *
 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
 {
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 	/* bufp always points at the link that leads to the current buf. */
 	arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
 	arc_buf_t *lastbuf = NULL;
 
 	/*
 	 * Remove the buf from the hdr list and locate the last
 	 * remaining buffer on the list.
 	 */
 	while (*bufp != NULL) {
 		/* Unlink by pointing the incoming link past buf. */
 		if (*bufp == buf)
 			*bufp = buf->b_next;
 
 		/*
 		 * If we've removed a buffer in the middle of
 		 * the list then update the lastbuf and update
 		 * bufp.
 		 */
 		if (*bufp != NULL) {
 			lastbuf = *bufp;
 			bufp = &(*bufp)->b_next;
 		}
 	}
 	buf->b_next = NULL;
 	ASSERT3P(lastbuf, !=, buf);
 	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
 
 	return (lastbuf);
 }
 
 /*
  * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
  * list and free it.
  *
  * Besides freeing the buf this may also: free the hdr's raw ABD once the
  * last encrypted buf goes away, re-share the hdr's data with the new last
  * buf when an uncompressed shared buf is destroyed, and free the hdr's
  * checksum once no uncompressed buf remains.
  */
 static void
 arc_buf_destroy_impl(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * Free up the data associated with the buf but only if we're not
 	 * sharing this with the hdr. If we are sharing it with the hdr, the
 	 * hdr is responsible for doing the free.
 	 */
 	if (buf->b_data != NULL) {
 		/*
 		 * We're about to change the hdr's b_flags. We must either
 		 * hold the hash_lock or be undiscoverable.
 		 */
 		ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		if (ARC_BUF_SHARED(buf)) {
 			/*
 			 * Only the hdr flag is cleared here; the buf keeps
 			 * ARC_BUF_FLAG_SHARED, which is tested again after
 			 * the buf has been unlinked below.
 			 */
 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
 		} else {
 			ASSERT(!arc_buf_is_shared(buf));
 			uint64_t size = arc_buf_size(buf);
 			arc_free_data_buf(hdr, buf->b_data, size, buf);
 			ARCSTAT_INCR(arcstat_overhead_size, -size);
 		}
 		buf->b_data = NULL;
 
 		/*
 		 * If we have no more encrypted buffers and we've already
 		 * gotten a copy of the decrypted data we can free b_rabd
 		 * to save some space.
 		 */
 		if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) &&
 		    hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) {
 			arc_buf_t *b;
 			/* Look for any other encrypted buf on this hdr. */
 			for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) {
 				if (b != buf && ARC_BUF_ENCRYPTED(b))
 					break;
 			}
 			if (b == NULL)
 				arc_hdr_free_abd(hdr, B_TRUE);
 		}
 	}
 
 	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 
 	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 		/*
 		 * If the current arc_buf_t is sharing its data buffer with the
 		 * hdr, then reassign the hdr's b_pabd to share it with the new
 		 * buffer at the end of the list. The shared buffer is always
 		 * the last one on the hdr's buffer list.
 		 *
 		 * There is an equivalent case for compressed bufs, but since
 		 * they aren't guaranteed to be the last buf in the list and
 		 * that is an exceedingly rare case, we just allow that space be
 		 * wasted temporarily. We must also be careful not to share
 		 * encrypted buffers, since they cannot be shared.
 		 */
 		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
 			/* Only one buf can be shared at once */
 			ASSERT(!arc_buf_is_shared(lastbuf));
 			/* hdr is uncompressed so can't have compressed buf */
 			ASSERT(!ARC_BUF_COMPRESSED(lastbuf));
 
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 			/*
 			 * We must setup a new shared block between the
 			 * last buffer and the hdr. The data would have
 			 * been allocated by the arc buf so we need to transfer
 			 * ownership to the hdr since it's now being shared.
 			 */
 			arc_share_buf(hdr, lastbuf);
 		}
 	} else if (HDR_SHARED_DATA(hdr)) {
 		/*
 		 * Uncompressed shared buffers are always at the end
 		 * of the list. Compressed buffers don't have the
 		 * same requirements. This makes it hard to
 		 * simply assert that the lastbuf is shared so
 		 * we rely on the hdr's compression flags to determine
 		 * if we have a compressed, shared buffer.
 		 */
 		ASSERT3P(lastbuf, !=, NULL);
 		ASSERT(arc_buf_is_shared(lastbuf) ||
 		    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 	}
 
 	/*
 	 * Free the checksum if we're removing the last uncompressed buf from
 	 * this hdr.
 	 */
 	if (!arc_hdr_has_uncompressed_buf(hdr)) {
 		arc_cksum_free(hdr);
 	}
 
 	/* clean up the buf */
 	buf->b_hdr = NULL;
 	kmem_cache_free(buf_cache, buf);
 }
 
 /*
  * Allocate one of the hdr's data ABDs.  With ARC_HDR_ALLOC_RDATA set in
  * alloc_flags the raw ABD (b_rabd, PSIZE bytes) is allocated and
  * arcstat_raw_size grows; otherwise the physical ABD (b_pabd,
  * arc_hdr_size() bytes) is allocated.  The slot being filled must be
  * empty.  Either way the compressed/uncompressed size kstats grow.
  */
 static void
 arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
 {
 	boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
 	uint64_t size;
 
 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
 	IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
 
 	if (!alloc_rdata) {
 		/* Physical data. */
 		size = arc_hdr_size(hdr);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
 		    alloc_flags);
 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 	} else {
 		/* Raw data of a protected hdr. */
 		size = HDR_GET_PSIZE(hdr);
 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
 		hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
 		    alloc_flags);
 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
 		ARCSTAT_INCR(arcstat_raw_size, size);
 	}
 
 	ARCSTAT_INCR(arcstat_compressed_size, size);
 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Free one of the hdr's data ABDs: the raw ABD (b_rabd, PSIZE bytes) when
  * free_rdata is set, otherwise the physical ABD (b_pabd, arc_hdr_size()
  * bytes).  The corresponding pointer is cleared and the size kstats are
  * decremented.  Once the hdr holds neither ABD its byteswap setting is
  * reset to DMU_BSWAP_NUMFUNCS.
  */
 static void
 arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
 {
 	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 	IMPLY(free_rdata, HDR_HAS_RABD(hdr));
 
 	/*
 	 * If the hdr is currently being written to the l2arc then
 	 * we defer freeing the data by adding it to the l2arc_free_on_write
 	 * list. The l2arc will free the data once it's finished
 	 * writing it to the l2arc device.
 	 */
 	if (HDR_L2_WRITING(hdr)) {
 		arc_hdr_free_on_write(hdr, free_rdata);
 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
 	} else if (free_rdata) {
 		arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
 	} else {
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr);
 	}
 
 	/* The pointer is cleared even when the free itself was deferred. */
 	if (free_rdata) {
 		hdr->b_crypt_hdr.b_rabd = NULL;
 		ARCSTAT_INCR(arcstat_raw_size, -size);
 	} else {
 		hdr->b_l1hdr.b_pabd = NULL;
 	}
 
 	if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 
 	ARCSTAT_INCR(arcstat_compressed_size, -size);
 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
 }
 
 /*
  * Allocate empty anonymous ARC header.  The header will get its identity
  * assigned and buffers attached later as part of read or write operations.
  *
  * In case of read arc_read() assigns header its identity (b_dva + b_birth),
  * inserts it into ARC hash to become globally visible and allocates physical
  * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk.  On disk read
  * completion arc_read_done() allocates ARC buffer(s) as needed, potentially
  * sharing one of them with the physical ABD buffer.
  *
  * In case of write arc_alloc_buf() allocates ARC buffer to be filled with
  * data.  Then after compression and/or encryption arc_write_ready() allocates
  * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD
  * buffer.  On disk write completion arc_write_done() assigns the header its
  * new identity (b_dva + b_birth) and inserts into ARC hash.
  *
  * In case of partial overwrite the old data is read first as described. Then
  * arc_release() either allocates new anonymous ARC header and moves the ARC
  * buffer to it, or reuses the old ARC header by discarding its identity and
  * removing it from ARC hash.  After buffer modification normal write process
  * follows as described.
  *
  * The returned header comes from hdr_full_cache, is in the arc_anon state,
  * has no bufs attached and has its access time and hit counters zeroed.
  * `type' must be ARC_BUFC_DATA or ARC_BUFC_METADATA.
  */
 static arc_buf_hdr_t *
 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
     boolean_t protected, enum zio_compress compression_type, uint8_t complevel,
     arc_buf_contents_t type)
 {
 	arc_buf_hdr_t *hdr;
 
 	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
 	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
 
 	ASSERT(HDR_EMPTY(hdr));
 #ifdef ZFS_DEBUG
 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 	HDR_SET_PSIZE(hdr, psize);
 	HDR_SET_LSIZE(hdr, lsize);
 	hdr->b_spa = spa;
 	hdr->b_type = type;
 	/* Flags start from zero; everything below is layered on top. */
 	hdr->b_flags = 0;
 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
 	arc_hdr_set_compress(hdr, compression_type);
 	hdr->b_complevel = complevel;
 	if (protected)
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 
 	hdr->b_l1hdr.b_state = arc_anon;
 	hdr->b_l1hdr.b_arc_access = 0;
 	hdr->b_l1hdr.b_mru_hits = 0;
 	hdr->b_l1hdr.b_mru_ghost_hits = 0;
 	hdr->b_l1hdr.b_mfu_hits = 0;
 	hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 	hdr->b_l1hdr.b_buf = NULL;
 
 	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 
 	return (hdr);
 }
 
 /*
  * Transition between the two allocation states for the arc_buf_hdr struct.
  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
  * version is used when a cache buffer is only in the L2ARC in order to reduce
  * memory usage.
  *
  * The hdr must have an L2 header and the caller must hold its hash lock.
  * A new header is allocated from `new', the first HDR_L2ONLY_SIZE bytes are
  * copied across, and the new header replaces the old one in the hash table,
  * on the device's buflist and in the l2ad_alloc refcount.  The old header
  * is freed back to `old' and must not be used afterwards; the new header is
  * returned.
  */
 static arc_buf_hdr_t *
 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
 {
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	arc_buf_hdr_t *nhdr;
 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 
 	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
 	    (old == hdr_l2only_cache && new == hdr_full_cache));
 
 	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	buf_hash_remove(hdr);
 
 	/* Only the part common to both header layouts is copied. */
 	memcpy(nhdr, hdr, HDR_L2ONLY_SIZE);
 
 	if (new == hdr_full_cache) {
 		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 		/*
 		 * arc_access and arc_change_state need to be aware that a
 		 * header has just come out of L2ARC, so we set its state to
 		 * l2c_only even though it's about to change.
 		 */
 		nhdr->b_l1hdr.b_state = arc_l2c_only;
 
 		/* Verify previous threads set to NULL before freeing */
 		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 	} else {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 #ifdef ZFS_DEBUG
 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 
 		/*
 		 * If we've reached here, We must have been called from
 		 * arc_evict_hdr(), as such we should have already been
 		 * removed from any ghost list we were previously on
 		 * (which protects us from racing with arc_evict_state),
 		 * thus no locking is needed during this check.
 		 */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		/*
 		 * A buffer must not be moved into the arc_l2c_only
 		 * state if it's not finished being written out to the
 		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
 		 * might try to be accessed, even though it was removed.
 		 */
 		VERIFY(!HDR_L2_WRITING(hdr));
 		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 		ASSERT(!HDR_HAS_RABD(hdr));
 
 		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
 	}
 	/*
 	 * The header has been reallocated so we need to re-insert it into any
 	 * lists it was on.
 	 */
 	(void) buf_hash_insert(nhdr, NULL);
 
 	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
 
 	mutex_enter(&dev->l2ad_mtx);
 
 	/*
 	 * We must place the realloc'ed header back into the list at
 	 * the same spot. Otherwise, if it's placed earlier in the list,
 	 * l2arc_write_buffers() could find it during the function's
 	 * write phase, and try to write it out to the l2arc.
 	 */
 	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	mutex_exit(&dev->l2ad_mtx);
 
 	/*
 	 * Since we're using the pointer address as the tag when
 	 * incrementing and decrementing the l2ad_alloc refcount, we
 	 * must remove the old pointer (that we're about to destroy) and
 	 * add the new pointer to the refcount. Otherwise we'd remove
 	 * the wrong pointer address when calling arc_hdr_destroy() later.
 	 */
 
 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 	    arc_hdr_size(hdr), hdr);
 	(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 	    arc_hdr_size(nhdr), nhdr);
 
 	/* The old header gives up its identity before being freed. */
 	buf_discard_identity(hdr);
 	kmem_cache_free(old, hdr);
 
 	return (nhdr);
 }
 
 /*
  * Turn a freshly allocated arc_buf_t into one suitable for a raw encrypted
  * write.  The send / receive code uses this, and it is also what lets the
  * root objset block be updated without disturbing its embedded MACs.  Both
  * kinds of block are always uncompressed, so neither the compression type
  * nor psize needs attention here.
  *
  * salt, iv and mac are each optional; a NULL pointer leaves the
  * corresponding field of the crypt header untouched.
  */
 void
 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
     dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
     const uint8_t *mac)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 
 	/* Raw bufs are flagged both compressed and encrypted. */
 	buf->b_flags |= ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED;
 	arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 
 	hdr->b_crypt_hdr.b_dsobj = dsobj;
 	hdr->b_crypt_hdr.b_ot = ot;
 
 	/* Host byte order needs no swap function. */
 	if (byteorder == ZFS_HOST_BYTEORDER)
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	else
 		hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(ot);
 
 	if (!arc_hdr_has_uncompressed_buf(hdr))
 		arc_cksum_free(hdr);
 
 	if (salt != NULL) {
 		memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
 	}
 	if (iv != NULL) {
 		memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
 	}
 	if (mac != NULL) {
 		memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
 	}
 }
 
 /*
  * Create a fresh arc_buf_hdr_t together with one arc_buf_t and hand the
  * buf back to the caller.  The buf is thawed before it is returned because
  * the consumer is expected to modify it.
  */
 arc_buf_t *
 arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type,
     int32_t size)
 {
 	arc_buf_t *buf = NULL;
 	arc_buf_hdr_t *hdr;
 
 	/* Uncompressed and unprotected: psize and lsize are both `size'. */
 	hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE,
 	    ZIO_COMPRESS_OFF, 0, type);
 
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
 	    B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	return (buf);
 }
 
 /*
  * Compressed counterpart of arc_alloc_buf().  Not to be used for bufs that
  * hold metadata.
  */
 arc_buf_t *
 arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize,
     uint64_t lsize, enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *buf = NULL;
 	arc_buf_hdr_t *hdr;
 
 	ASSERT3U(lsize, >, 0);
 	ASSERT3U(lsize, >=, psize);
 	ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
 
 	hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE,
 	    compression_type, complevel, ARC_BUFC_DATA);
 
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
 	    B_TRUE, B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	/*
 	 * arc_untransform() may be called on this buf before it has ever
 	 * been written to disk, and the hdr needs to hold the right data
 	 * when that happens.  Sharing the buf's data with the hdr is the
 	 * simplest way to guarantee it.
 	 */
 	arc_share_buf(hdr, buf);
 
 	return (buf);
 }
 
 /*
  * Raw (encrypted) counterpart of arc_alloc_buf().  A protected hdr is
  * allocated, its crypt parameters (dsobj, object type, byteswap, salt, IV
  * and MAC) are filled in from the arguments, and a single encrypted buf is
  * attached, thawed and returned.
  */
 arc_buf_t *
 arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj,
     boolean_t byteorder, const uint8_t *salt, const uint8_t *iv,
     const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
     enum zio_compress compression_type, uint8_t complevel)
 {
 	arc_buf_t *buf = NULL;
 	arc_buf_hdr_t *hdr;
 	arc_buf_contents_t type;
 
 	if (DMU_OT_IS_METADATA(ot))
 		type = ARC_BUFC_METADATA;
 	else
 		type = ARC_BUFC_DATA;
 
 	ASSERT3U(lsize, >, 0);
 	ASSERT3U(lsize, >=, psize);
 	ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
 
 	hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
 	    compression_type, complevel, type);
 
 	hdr->b_crypt_hdr.b_dsobj = dsobj;
 	hdr->b_crypt_hdr.b_ot = ot;
 
 	/* Host byte order needs no swap function. */
 	if (byteorder == ZFS_HOST_BYTEORDER)
 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 	else
 		hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(ot);
 
 	memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
 	memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
 	memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
 
 	/*
 	 * The buf is treated as encrypted even when `ot' is not an
 	 * encrypted type; in that case arc_write_ready() turns it into an
 	 * authenticated buf instead.
 	 */
 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
 	    B_FALSE, B_FALSE, &buf));
 	arc_buf_thaw(buf);
 
 	return (buf);
 }
 
 /*
  * Add (incr) or subtract (!incr) this hdr's sizes to or from the L2ARC
  * kstats.  The prefetch / MRU / MFU asize stat is always adjusted; with
  * state_only set nothing else is touched, otherwise the psize, lsize and
  * per-contents-type asize stats are adjusted as well.
  */
 static void
 l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr,
     boolean_t state_only)
 {
 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 	const uint64_t lsize = HDR_GET_LSIZE(hdr);
 	const uint64_t psize = HDR_GET_PSIZE(hdr);
 	const uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 
 	/* Signed deltas: positive when adding, negative when removing. */
 	const int64_t lsize_s = incr ? lsize : -lsize;
 	const int64_t psize_s = incr ? psize : -psize;
 	const int64_t asize_s = incr ? asize : -asize;
 
 	if (HDR_PREFETCH(hdr)) {
 		/* A prefetched buffer is counted as a prefetch. */
 		ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s);
 	} else {
 		/*
 		 * The state recorded in the L2 header when the buffer was
 		 * first cached in L2ARC is used here.  It is refreshed when
 		 * an MRU/MRU_ghost buffer moves to MFU, although the L2ARC
 		 * metadata (log entry) cannot currently follow.  Keeping
 		 * the ARC state in the L2 header copes with an L1 header
 		 * that may be absent, as happens for buffers restored from
 		 * persistent L2ARC.
 		 */
 		const int l2_state = hdr->b_l2hdr.b_arcs_state;
 
 		if (l2_state == ARC_STATE_MRU_GHOST ||
 		    l2_state == ARC_STATE_MRU) {
 			ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s);
 		} else if (l2_state == ARC_STATE_MFU_GHOST ||
 		    l2_state == ARC_STATE_MFU) {
 			ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s);
 		}
 	}
 
 	if (!state_only) {
 		ARCSTAT_INCR(arcstat_l2_psize, psize_s);
 		ARCSTAT_INCR(arcstat_l2_lsize, lsize_s);
 
 		if (hdr->b_type == ARC_BUFC_DATA) {
 			ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s);
 		} else if (hdr->b_type == ARC_BUFC_METADATA) {
 			ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s);
 		}
 	}
 }
 
 
 /*
  * Tear down the L2 portion of a header: take it off the device's buflist,
  * undo its L2ARC kstats and vdev space accounting, drop its l2ad_alloc
  * reference and clear ARC_FLAG_HAS_L2HDR.  The device's l2ad_mtx must be
  * held by the caller.
  */
 static void
 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 	uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    HDR_GET_PSIZE(hdr));
 
 	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	list_remove(&dev->l2ad_buflist, hdr);
 
 	/* Stats and device space shrink by this header's share. */
 	l2arc_hdr_arcstats_decrement(hdr);
 	vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
 
 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 	    arc_hdr_size(hdr), hdr);
 
 	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
 }
 
 /*
  * Free an ARC buffer header together with everything still attached to it.
  *
  * If the header has an L2 portion it is destroyed under the owning device's
  * l2ad_mtx (acquired here unless the caller already holds it).  The header's
  * identity is then discarded, any L1 resources (checksum, bufs, ABDs) are
  * released, and the header memory goes back to hdr_full_cache or
  * hdr_l2only_cache depending on whether it has an L1 portion.
  *
  * Asserted preconditions: no I/O in progress, not in the hash table and,
  * when an L1 header is present, a zero reference count and arc_anon state.
  */
 static void
 arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
 	if (HDR_HAS_L1HDR(hdr)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
 	}
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
 		/* Remember whether the caller already owns l2ad_mtx. */
 		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
 
 		if (!buflist_held)
 			mutex_enter(&dev->l2ad_mtx);
 
 		/*
 		 * Even though we checked this conditional above, we
 		 * need to check this again now that we have the
 		 * l2ad_mtx. This is because we could be racing with
 		 * another thread calling l2arc_evict() which might have
 		 * destroyed this header's L2 portion as we were waiting
 		 * to acquire the l2ad_mtx. If that happens, we don't
 		 * want to re-destroy the header's L2 portion.
 		 */
 		if (HDR_HAS_L2HDR(hdr)) {
 
 			if (!HDR_EMPTY(hdr))
 				buf_discard_identity(hdr);
 
 			arc_hdr_l2hdr_destroy(hdr);
 		}
 
 		/* Only drop the mutex if it was taken in this function. */
 		if (!buflist_held)
 			mutex_exit(&dev->l2ad_mtx);
 	}
 
 	/*
 	 * The header's identity can only be safely discarded once it is no
 	 * longer discoverable.  This requires removing it from the hash table
 	 * and the l2arc header list.  After this point the hash lock can not
 	 * be used to protect the header.
 	 */
 	if (!HDR_EMPTY(hdr))
 		buf_discard_identity(hdr);
 
 	if (HDR_HAS_L1HDR(hdr)) {
 		arc_cksum_free(hdr);
 
 		/* Destroy every buf still hanging off the header. */
 		while (hdr->b_l1hdr.b_buf != NULL)
 			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
 
 		if (hdr->b_l1hdr.b_pabd != NULL)
 			arc_hdr_free_abd(hdr, B_FALSE);
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 	}
 
 	ASSERT3P(hdr->b_hash_next, ==, NULL);
 	/* Return the header to the cache matching its layout. */
 	if (HDR_HAS_L1HDR(hdr)) {
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 #ifdef ZFS_DEBUG
 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 		kmem_cache_free(hdr_full_cache, hdr);
 	} else {
 		kmem_cache_free(hdr_l2only_cache, hdr);
 	}
 }
 
 /*
  * Release 'buf' and drop the reference that 'tag' holds on its header.
  *
  * For a header in the arc_anon state only the reference is dropped; the
  * VERIFY0() requires remove_reference() to return zero in that case.  For
  * any other state the buf is destroyed and the reference removed while
  * holding the header's hash lock.
  */
 void
 arc_buf_destroy(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/* Anonymous header: handled without taking the hash lock. */
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 		ASSERT(ARC_BUF_LAST(buf));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		VERIFY0(remove_reference(hdr, tag));
 		return;
 	}
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	/* Re-validate the header now that the hash lock is held. */
 	ASSERT3P(hdr, ==, buf->b_hdr);
 	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
 	ASSERT3P(buf->b_data, !=, NULL);
 
 	arc_buf_destroy_impl(buf);
 	(void) remove_reference(hdr, tag);
 	mutex_exit(hash_lock);
 }
 
 /*
  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
  * state of the header is dependent on its state prior to entering this
  * function. The following transitions are possible:
  *
  *    - arc_mru -> arc_mru_ghost
  *    - arc_mfu -> arc_mfu_ghost
  *    - arc_mru_ghost -> arc_l2c_only
  *    - arc_mru_ghost -> deleted
  *    - arc_mfu_ghost -> arc_l2c_only
  *    - arc_mfu_ghost -> deleted
  *    - arc_uncached -> deleted
  *
  * Return total size of evicted data buffers for eviction progress tracking.
  * When evicting from ghost states return logical buffer size to make eviction
  * progress at the same (or at least comparable) rate as from non-ghost states.
  *
  * Return *real_evicted for actual ARC size reduction to wake up threads
  * waiting for it.  For non-ghost states it includes size of evicted data
  * buffers (the headers are not freed there).  For ghost states it includes
  * only the evicted headers size.
  */
 static int64_t
 arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted)
 {
 	arc_state_t *evicted_state, *state;
 	int64_t bytes_evicted = 0;
 	/* Minimum lifetime (ms) applied below to prefetch/indirect headers. */
 	uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
 	    arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
 
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 	ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
 
 	*real_evicted = 0;
 	state = hdr->b_l1hdr.b_state;
 	if (GHOST_STATE(state)) {
 
 		/*
 		 * l2arc_write_buffers() relies on a header's L1 portion
 		 * (i.e. its b_pabd field) during its write phase.
 		 * Thus, we cannot push a header onto the arc_l2c_only
 		 * state (removing its L1 piece) until the header is
 		 * done being written to the l2arc.
 		 */
 		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
 			ARCSTAT_BUMP(arcstat_evict_l2_skip);
 			/* Skipped: bytes_evicted is still zero here. */
 			return (bytes_evicted);
 		}
 
 		ARCSTAT_BUMP(arcstat_deleted);
 		/* Ghost headers report their logical size as progress. */
 		bytes_evicted += HDR_GET_LSIZE(hdr);
 
 		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
 
 		if (HDR_HAS_L2HDR(hdr)) {
 			ASSERT(hdr->b_l1hdr.b_pabd == NULL);
 			ASSERT(!HDR_HAS_RABD(hdr));
 			/*
 			 * This buffer is cached on the 2nd Level ARC;
 			 * don't destroy the header.
 			 */
 			arc_change_state(arc_l2c_only, hdr);
 			/*
 			 * dropping from L1+L2 cached to L2-only,
 			 * realloc to remove the L1 header.
 			 */
 			(void) arc_hdr_realloc(hdr, hdr_full_cache,
 			    hdr_l2only_cache);
 			/* Only the L1 part of the header was given back. */
 			*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
 		} else {
 			arc_change_state(arc_anon, hdr);
 			arc_hdr_destroy(hdr);
 			*real_evicted += HDR_FULL_SIZE;
 		}
 		return (bytes_evicted);
 	}
 
 	ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached);
 	/*
 	 * Uncached headers go to arc_anon (and are destroyed below); MRU and
 	 * MFU headers move to their respective ghost state.
 	 */
 	evicted_state = (state == arc_uncached) ? arc_anon :
 	    ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost);
 
 	/* prefetch buffers have a minimum lifespan */
 	if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
 	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
 	    MSEC_TO_TICK(min_lifetime)) {
 		ARCSTAT_BUMP(arcstat_evict_skip);
 		return (bytes_evicted);
 	}
 
 	/* Record L2ARC-related eviction statistics for this header. */
 	if (HDR_HAS_L2HDR(hdr)) {
 		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
 	} else {
 		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
 			ARCSTAT_INCR(arcstat_evict_l2_eligible,
 			    HDR_GET_LSIZE(hdr));
 
 			switch (state->arcs_state) {
 				case ARC_STATE_MRU:
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_eligible_mru,
 					    HDR_GET_LSIZE(hdr));
 					break;
 				case ARC_STATE_MFU:
 					ARCSTAT_INCR(
 					    arcstat_evict_l2_eligible_mfu,
 					    HDR_GET_LSIZE(hdr));
 					break;
 				default:
 					break;
 			}
 		} else {
 			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
 			    HDR_GET_LSIZE(hdr));
 		}
 	}
 
 	/* The header's size counts as both progress and real reduction. */
 	bytes_evicted += arc_hdr_size(hdr);
 	*real_evicted += arc_hdr_size(hdr);
 
 	/*
 	 * If this hdr is being evicted and has a compressed buffer then we
 	 * discard it here before we change states.  This ensures that the
 	 * accounting is updated correctly in arc_free_data_impl().
 	 */
 	if (hdr->b_l1hdr.b_pabd != NULL)
 		arc_hdr_free_abd(hdr, B_FALSE);
 
 	if (HDR_HAS_RABD(hdr))
 		arc_hdr_free_abd(hdr, B_TRUE);
 
 	arc_change_state(evicted_state, hdr);
 	DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
 	if (evicted_state == arc_anon) {
 		/* Uncached case: the header itself is freed as well. */
 		arc_hdr_destroy(hdr);
 		*real_evicted += HDR_FULL_SIZE;
 	} else {
 		ASSERT(HDR_IN_HASH_TABLE(hdr));
 	}
 
 	return (bytes_evicted);
 }
 
 /*
  * Recompute arc_need_free.  It becomes the larger of the current shortfall
  * of free memory below arc_sys_free / 2 and the eviction count still owed
  * to the last queued waiter (zero when nobody is waiting).
  *
  * Must be called with arc_evict_lock held.
  */
 static void
 arc_set_need_free(void)
 {
 	ASSERT(MUTEX_HELD(&arc_evict_lock));
 	int64_t remaining = arc_free_memory() - arc_sys_free / 2;
 	arc_evict_waiter_t *last = list_tail(&arc_evict_waiters);
 	int64_t owed = 0;
 
 	/* The tail waiter is the one used to measure what is still owed. */
 	if (last != NULL)
 		owed = (int64_t)(last->aew_count - arc_evict_count);
 
 	arc_need_free = MAX(-remaining, owed);
 }
 
 /*
  * Evict from sublist 'idx' of multilist 'ml', walking backwards from
  * 'marker', until roughly 'bytes' bytes have been evicted, the sublist is
  * exhausted, or zfs_arc_evict_batch_limit headers have been evicted.
  *
  * Marker headers (b_spa == 0) are skipped, as are headers belonging to a
  * different spa when 'spa' is non-zero, and headers whose hash lock cannot
  * be taken without blocking.
  *
  * Returns the sum of the values returned by arc_evict_hdr().  The "real"
  * evicted size is added to arc_evict_count under arc_evict_lock, after
  * which satisfied waiters are woken if enough memory is free.
  */
 static uint64_t
 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
     uint64_t spa, uint64_t bytes)
 {
 	multilist_sublist_t *mls;
 	uint64_t bytes_evicted = 0, real_evicted = 0;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	/* Remaining number of successful evictions allowed in this call. */
 	uint_t evict_count = zfs_arc_evict_batch_limit;
 
 	ASSERT3P(marker, !=, NULL);
 
 	mls = multilist_sublist_lock_idx(ml, idx);
 
 	for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
 	    hdr = multilist_sublist_prev(mls, marker)) {
 		if ((evict_count == 0) || (bytes_evicted >= bytes))
 			break;
 
 		/*
 		 * To keep our iteration location, move the marker
 		 * forward. Since we're not holding hdr's hash lock, we
 		 * must be very careful and not remove 'hdr' from the
 		 * sublist. Otherwise, other consumers might mistake the
 		 * 'hdr' as not being on a sublist when they call the
 		 * multilist_link_active() function (they all rely on
 		 * the hash lock protecting concurrent insertions and
 		 * removals). multilist_sublist_move_forward() was
 		 * specifically implemented to ensure this is the case
 		 * (only 'marker' will be removed and re-inserted).
 		 */
 		multilist_sublist_move_forward(mls, marker);
 
 		/*
 		 * The only case where the b_spa field should ever be
 		 * zero, is the marker headers inserted by
 		 * arc_evict_state(). It's possible for multiple threads
 		 * to be calling arc_evict_state() concurrently (e.g.
 		 * dsl_pool_close() and zio_inject_fault()), so we must
 		 * skip any markers we see from these other threads.
 		 */
 		if (hdr->b_spa == 0)
 			continue;
 
 		/* we're only interested in evicting buffers of a certain spa */
 		if (spa != 0 && hdr->b_spa != spa) {
 			ARCSTAT_BUMP(arcstat_evict_skip);
 			continue;
 		}
 
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We aren't calling this function from any code path
 		 * that would already be holding a hash lock, so we're
 		 * asserting on this assumption to be defensive in case
 		 * this ever changes. Without this check, it would be
 		 * possible to incorrectly increment arcstat_mutex_miss
 		 * below (e.g. if the code changed such that we called
 		 * this function with a hash lock held).
 		 */
 		ASSERT(!MUTEX_HELD(hash_lock));
 
 		/* Never block on the hash lock; a miss just skips the header. */
 		if (mutex_tryenter(hash_lock)) {
 			uint64_t revicted;
 			uint64_t evicted = arc_evict_hdr(hdr, &revicted);
 			mutex_exit(hash_lock);
 
 			bytes_evicted += evicted;
 			real_evicted += revicted;
 
 			/*
 			 * If evicted is zero, arc_evict_hdr() must have
 			 * decided to skip this header, don't increment
 			 * evict_count in this case.
 			 */
 			if (evicted != 0)
 				evict_count--;
 
 		} else {
 			ARCSTAT_BUMP(arcstat_mutex_miss);
 		}
 	}
 
 	multilist_sublist_unlock(mls);
 
 	/*
 	 * Increment the count of evicted bytes, and wake up any threads that
 	 * are waiting for the count to reach this value.  Since the list is
 	 * ordered by ascending aew_count, we pop off the beginning of the
 	 * list until we reach the end, or a waiter that's past the current
 	 * "count".  Doing this outside the loop reduces the number of times
 	 * we need to acquire the global arc_evict_lock.
 	 *
 	 * Only wake when there's sufficient free memory in the system
 	 * (specifically, arc_sys_free/2, which by default is a bit more than
 	 * 1/64th of RAM).  See the comments in arc_wait_for_eviction().
 	 */
 	mutex_enter(&arc_evict_lock);
 	arc_evict_count += real_evicted;
 
 	if (arc_free_memory() > arc_sys_free / 2) {
 		arc_evict_waiter_t *aw;
 		while ((aw = list_head(&arc_evict_waiters)) != NULL &&
 		    aw->aew_count <= arc_evict_count) {
 			list_remove(&arc_evict_waiters, aw);
 			cv_broadcast(&aw->aew_cv);
 		}
 	}
 	arc_set_need_free();
 	mutex_exit(&arc_evict_lock);
 
 	/*
 	 * If the ARC size is reduced from arc_c_max to arc_c_min (especially
 	 * if the average cached block is small), eviction can be on-CPU for
 	 * many seconds.  To ensure that other threads that may be bound to
 	 * this CPU are able to make progress, make a voluntary preemption
 	 * call here.
 	 */
 	kpreempt(KPREEMPT_SYNC);
 
 	return (bytes_evicted);
 }
 
 /*
  * Allocate a single marker header from hdr_full_cache (sleeping allocation).
  * The returned header has b_spa set to 0, which is how
  * arc_evict_state_impl() tells markers apart from real headers.
  */
 static arc_buf_hdr_t *
 arc_state_alloc_marker(void)
 {
 	arc_buf_hdr_t *hdr;
 
 	hdr = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
 
 	/* Tag the header as a marker. */
 	hdr->b_spa = 0;
 
 	return (hdr);
 }
 
 /*
  * Return a marker header to hdr_full_cache, the cache that
  * arc_state_alloc_marker() allocates from.
  */
 static void
 arc_state_free_marker(arc_buf_hdr_t *marker)
 {
 	kmem_cache_free(hdr_full_cache, marker);
 }
 
 /*
  * Build an array of 'count' freshly allocated marker headers, used as
  * placeholders during arc state eviction.  The array and its entries are
  * released with arc_state_free_markers().
  */
 static arc_buf_hdr_t **
 arc_state_alloc_markers(int count)
 {
 	arc_buf_hdr_t **markers =
 	    kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
 	int i = 0;
 
 	while (i < count) {
 		markers[i] = arc_state_alloc_marker();
 		i++;
 	}
 
 	return (markers);
 }
 
 /*
  * Free every marker in 'markers' and then the array itself.  'count' must
  * be the element count the array was allocated with, since it determines
  * the size passed to kmem_free().
  */
 static void
 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 {
 	int i = 0;
 
 	while (i < count) {
 		arc_state_free_marker(markers[i]);
 		i++;
 	}
 
 	kmem_free(markers, sizeof (*markers) * count);
 }
 
 /*
  * Evict buffers from the given arc state, until we've removed the
  * specified number of bytes. Move the removed buffers to the
  * appropriate evict state.
  *
  * This function makes a "best effort". It skips over any buffers
  * it can't get a hash_lock on, and so, may not catch all candidates.
  * It may also return without evicting as much space as requested.
  *
  * If bytes is specified using the special value ARC_EVICT_ALL, this
  * will evict all available (i.e. unlocked and evictable) buffers from
  * the given arc state; which is used by arc_flush().
  */
 static uint64_t
 arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
     uint64_t bytes)
 {
 	uint64_t total_evicted = 0;
 	multilist_t *ml = &state->arcs_list[type];
 	int num_sublists;
 	arc_buf_hdr_t **markers;
 
 	num_sublists = multilist_get_num_sublists(ml);
 
 	/*
 	 * If we've tried to evict from each sublist, made some
 	 * progress, but still have not hit the target number of bytes
 	 * to evict, we want to keep trying. The markers allow us to
 	 * pick up where we left off for each individual sublist, rather
 	 * than starting from the tail each time.
 	 *
 	 * The evict zthr uses the shared arc_state_evict_markers array;
 	 * any other caller allocates a private set that is freed below.
 	 */
 	if (zthr_iscurthread(arc_evict_zthr)) {
 		markers = arc_state_evict_markers;
 		ASSERT3S(num_sublists, <=, arc_state_evict_marker_count);
 	} else {
 		markers = arc_state_alloc_markers(num_sublists);
 	}
 	/* Park one marker at the tail of every sublist. */
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls;
 
 		mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_insert_tail(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
 
 	/*
 	 * While we haven't hit our target number of bytes to evict, or
 	 * we're evicting all available buffers.
 	 */
 	while (total_evicted < bytes) {
 		int sublist_idx = multilist_get_random_index(ml);
 		/* Bytes evicted during this pass over all sublists. */
 		uint64_t scan_evicted = 0;
 
 		/*
 		 * Start eviction using a randomly selected sublist,
 		 * this is to try and evenly balance eviction across all
 		 * sublists. Always starting at the same sublist
 		 * (e.g. index 0) would cause evictions to favor certain
 		 * sublists over others.
 		 */
 		for (int i = 0; i < num_sublists; i++) {
 			uint64_t bytes_remaining;
 			uint64_t bytes_evicted;
 
 			if (total_evicted < bytes)
 				bytes_remaining = bytes - total_evicted;
 			else
 				break;
 
 			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
 			    markers[sublist_idx], spa, bytes_remaining);
 
 			scan_evicted += bytes_evicted;
 			total_evicted += bytes_evicted;
 
 			/* we've reached the end, wrap to the beginning */
 			if (++sublist_idx >= num_sublists)
 				sublist_idx = 0;
 		}
 
 		/*
 		 * If we didn't evict anything during this scan, we have
 		 * no reason to believe we'll evict more during another
 		 * scan, so break the loop.
 		 */
 		if (scan_evicted == 0) {
 			/* This isn't possible, let's make that obvious */
 			ASSERT3S(bytes, !=, 0);
 
 			/*
 			 * When bytes is ARC_EVICT_ALL, the only way to
 			 * break the loop is when scan_evicted is zero.
 			 * In that case, we actually have evicted enough,
 			 * so we don't want to increment the kstat.
 			 */
 			if (bytes != ARC_EVICT_ALL) {
 				ASSERT3S(total_evicted, <, bytes);
 				ARCSTAT_BUMP(arcstat_evict_not_enough);
 			}
 
 			break;
 		}
 	}
 
 	/* Pull the markers back out of every sublist. */
 	for (int i = 0; i < num_sublists; i++) {
 		multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
 		multilist_sublist_remove(mls, markers[i]);
 		multilist_sublist_unlock(mls);
 	}
 	/* Only a privately allocated marker array is freed here. */
 	if (markers != arc_state_evict_markers)
 		arc_state_free_markers(markers, num_sublists);
 
 	return (total_evicted);
 }
 
 /*
  * Flush all "evictable" data of the given type from the arc state
  * specified. This will not evict any "active" buffers (i.e. referenced).
  *
  * When 'retry' is set to B_FALSE, the function will make a single pass
  * over the state and evict any buffers that it can. Since it doesn't
  * continually retry the eviction, it might end up leaving some buffers
  * in the ARC due to lock misses.
  *
  * When 'retry' is set to B_TRUE, the function will continually retry the
  * eviction until *all* evictable buffers have been removed from the
  * state. As a result, if concurrent insertions into the state are
  * allowed (e.g. if the ARC isn't shutting down), this function might
  * wind up in an infinite loop, continually trying to evict buffers.
  */
 static uint64_t
 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
     boolean_t retry)
 {
 	uint64_t flushed = 0;
 
 	/* Nothing evictable of this type: no pass is made at all. */
 	if (zfs_refcount_count(&state->arcs_esize[type]) == 0)
 		return (flushed);
 
 	/*
 	 * Always make one pass; keep going only when asked to retry and
 	 * evictable data of this type is still present.
 	 */
 	do {
 		flushed += arc_evict_state(state, type, spa, ARC_EVICT_ALL);
 	} while (retry && zfs_refcount_count(&state->arcs_esize[type]) != 0);
 
 	return (flushed);
 }
 
 /*
  * Evict the specified number of bytes from the state specified. This
  * function prevents us from trying to evict more from a state's list
  * than is "evictable", and to skip evicting altogether when passed a
  * negative value for "bytes". In contrast, arc_evict_state() will
  * evict everything it can, when passed a negative value for "bytes".
  */
 static uint64_t
 arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes)
 {
 	/* Skip when nothing is requested or nothing is evictable. */
 	if (bytes <= 0 || zfs_refcount_count(&state->arcs_esize[type]) <= 0)
 		return (0);
 
 	/* Never ask for more than the state's evictable size. */
 	uint64_t request = MIN(zfs_refcount_count(&state->arcs_esize[type]),
 	    bytes);
 
 	return (arc_evict_state(state, type, 0, request));
 }
 
 /*
  * Adjust specified fraction, taking into account initial ghost state(s) size,
  * ghost hit bytes towards increasing the fraction, ghost hit bytes towards
  * decreasing it, plus a balance factor, controlling the decrease rate, used
  * to balance metadata vs data.
  */
 static uint64_t
 arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down,
     uint_t balance)
 {
 	/* Leave frac untouched for a tiny 'total' or when there were no hits. */
 	if (total < 8 || up + down == 0)
 		return (frac);
 
 	/*
 	 * We should not have more ghost hits than ghost size, but they
 	 * may get close.  Restrict maximum adjustment in that case.
 	 */
 	if (up + down >= total / 4) {
 		/* total >= 8 here, so total / 8 is at least 1. */
 		uint64_t scale = (up + down) / (total / 8);
 		up /= scale;
 		down /= scale;
 	}
 
 	/* Get maximal dynamic range by choosing optimal shifts. */
 	int s = highbit64(total);
 	s = MIN(64 - s, 32);
 
 	/* Complement of frac on a 2^32 scale. */
 	uint64_t ofrac = (1ULL << 32) - frac;
 
 	/* Damp 'up' once frac is at least four times its complement. */
 	if (frac >= 4 * ofrac)
 		up /= frac / (2 * ofrac + 1);
 	up = (up << s) / (total >> (32 - s));
 	/* Likewise damp 'down' once the complement dominates frac. */
 	if (ofrac >= 4 * frac)
 		down /= ofrac / (2 * frac + 1);
 	down = (down << s) / (total >> (32 - s));
 	/*
 	 * A balance of 100 leaves 'down' unchanged; larger values reduce it.
 	 * NOTE(review): a balance of 0 would divide by zero here -- confirm
 	 * that callers can never pass 0 (e.g. via zfs_arc_meta_balance).
 	 */
 	down = down * 100 / balance;
 
 	return (frac + up - down);
 }
 
 /*
  * Evict buffers from the cache, such that arcstat_size is capped by arc_c.
  */
 static uint64_t
 arc_evict(void)
 {
 	uint64_t asize, bytes, total_evicted = 0;
 	int64_t e, mrud, mrum, mfud, mfum, w;
 	/*
 	 * Static state carried between calls: the previous ghost hit counter
 	 * values (og*) and the ghost size targets computed by the previous
 	 * call (gs*).
 	 */
 	static uint64_t ogrd, ogrm, ogfd, ogfm;
 	static uint64_t gsrd, gsrm, gsfd, gsfm;
 	uint64_t ngrd, ngrm, ngfd, ngfm;
 
 	/* Get current size of ARC states we can evict from. */
 	mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	/* d = all data, m = all metadata, t = their total. */
 	uint64_t d = mrud + mfud;
 	uint64_t m = mrum + mfum;
 	uint64_t t = d + m;
 
 	/* Get ARC ghost hits since last eviction. */
 	ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
 	uint64_t grd = ngrd - ogrd;
 	ogrd = ngrd;
 	ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	uint64_t grm = ngrm - ogrm;
 	ogrm = ngrm;
 	ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
 	uint64_t gfd = ngfd - ogfd;
 	ogfd = ngfd;
 	ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	uint64_t gfm = ngfm - ogfm;
 	ogfm = ngfm;
 
 	/* Adjust ARC states balance based on ghost hits. */
 	arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm,
 	    grm + gfm, grd + gfd, zfs_arc_meta_balance);
 	arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100);
 	arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100);
 
 	asize = aggsum_value(&arc_sums.arcstat_size);
 	/* Total of the evictable states less the excess of asize over arc_c. */
 	int64_t wt = t - (asize - arc_c);
 
 	/*
 	 * Try to reduce pinned dnodes if more than 3/4 of wanted metadata
 	 * target is not evictable or if they go over arc_dnode_limit.
 	 */
 	int64_t prune = 0;
 	int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size);
 	/* The two 16-bit shifts scale wt by arc_meta / 2^32. */
 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
 	if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) +
 	    zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) -
 	    zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) -
 	    zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]) >
 	    w * 3 / 4) {
 		prune = dn / sizeof (dnode_t) *
 		    zfs_arc_dnode_reduce_percent / 100;
 	} else if (dn > arc_dnode_limit) {
 		prune = (dn - arc_dnode_limit) / sizeof (dnode_t) *
 		    zfs_arc_dnode_reduce_percent / 100;
 	}
 	if (prune > 0)
 		arc_prune_async(prune);
 
 	/* Evict MRU metadata. */
 	w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16;
 	e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w));
 	bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e);
 	total_evicted += bytes;
 	mrum -= bytes;
 	asize -= bytes;
 
 	/* Evict MFU metadata. */
 	w = wt * (int64_t)(arc_meta >> 16) >> 16;
 	e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w));
 	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e);
 	total_evicted += bytes;
 	mfum -= bytes;
 	asize -= bytes;
 
 	/* Evict MRU data. */
 	/* Remove the metadata still cached from the wanted total. */
 	wt -= m - total_evicted;
 	w = wt * (int64_t)(arc_pd >> 16) >> 16;
 	e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w));
 	bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e);
 	total_evicted += bytes;
 	mrud -= bytes;
 	asize -= bytes;
 
 	/* Evict MFU data. */
 	e = asize - arc_c;
 	bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e);
 	mfud -= bytes;
 	total_evicted += bytes;
 
 	/*
 	 * Evict ghost lists
 	 *
 	 * Size of each state's ghost list represents how much that state
 	 * may grow by shrinking the other states.  Would it need to shrink
 	 * other states to zero (that is unlikely), its ghost size would be
 	 * equal to sum of other three state sizes.  But excessive ghost
 	 * size may result in false ghost hits (too far back), that may
 	 * never result in real cache hits if several states are competing.
 	 * So choose some arbitrary point of 1/2 of other state sizes.
 	 */
 	gsrd = (mrum + mfud + mfum) / 2;
 	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) -
 	    gsrd;
 	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e);
 
 	gsrm = (mrud + mfud + mfum) / 2;
 	e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) -
 	    gsrm;
 	(void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e);
 
 	gsfd = (mrud + mrum + mfum) / 2;
 	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) -
 	    gsfd;
 	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e);
 
 	gsfm = (mrud + mrum + mfud) / 2;
 	e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) -
 	    gsfm;
 	(void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e);
 
 	/* Ghost list evictions are not included in the returned total. */
 	return (total_evicted);
 }
 
 /*
  * Flush the evictable data and metadata of the mru, mfu, mru_ghost,
  * mfu_ghost and uncached states, in that order.  When 'spa' is non-NULL
  * only buffers carrying that pool's load guid are targeted; with
  * 'retry' set each state is flushed repeatedly until it is empty.
  */
 void
 arc_flush(spa_t *spa, boolean_t retry)
 {
 	/*
 	 * If retry is B_TRUE, a spa must not be specified since we have
 	 * no good way to determine if all of a spa's buffers have been
 	 * evicted from an arc state.
 	 */
 	ASSERT(!retry || spa == NULL);
 
 	/* A guid of 0 means "any pool". */
 	uint64_t guid = (spa != NULL) ? spa_load_guid(spa) : 0;
 
 	arc_state_t *flush_order[] = {
 		arc_mru, arc_mfu, arc_mru_ghost, arc_mfu_ghost, arc_uncached
 	};
 	size_t nstates = sizeof (flush_order) / sizeof (flush_order[0]);
 
 	/* For every state flush data first, then metadata. */
 	for (size_t s = 0; s < nstates; s++) {
 		(void) arc_flush_state(flush_order[s], guid, ARC_BUFC_DATA,
 		    retry);
 		(void) arc_flush_state(flush_order[s], guid,
 		    ARC_BUFC_METADATA, retry);
 	}
 }
 
 /*
  * Lower the ARC target size (arc_c) so that at least 'to_free' bytes get
  * evicted, never going below arc_c_min, then flag that eviction is needed
  * and wake the evict thread.  Does nothing when arc_c is already at or
  * below arc_c_min.
  */
 void
 arc_reduce_target_size(int64_t to_free)
 {
 	const uint64_t target = arc_c;
 
 	if (target <= arc_c_min)
 		return;
 
 	/*
 	 * All callers want the ARC to actually evict (at least) this much
 	 * memory.  Therefore we reduce from the lower of the current size and
 	 * the target size.  This way, even if arc_c is much higher than
 	 * arc_size (as can be the case after many calls to arc_freed()), we
 	 * will immediately have arc_c < arc_size and therefore the
 	 * arc_evict_zthr will evict.
 	 */
 	const uint64_t cached = aggsum_value(&arc_sums.arcstat_size);
 
 	if (cached < target)
 		to_free += target - cached;
 
 	arc_c = MAX((int64_t)target - to_free, (int64_t)arc_c_min);
 
 	/* See comment in arc_evict_cb_check() on why lock+flag */
 	mutex_enter(&arc_evict_lock);
 	arc_evict_needed = B_TRUE;
 	mutex_exit(&arc_evict_lock);
 
 	zthr_wakeup(arc_evict_zthr);
 }
 
 /*
  * Determine if the system is under memory pressure and is asking
  * to reclaim memory. A return value of B_TRUE indicates that the system
  * is under memory pressure and that the arc should adjust accordingly.
  */
 boolean_t
 arc_reclaim_needed(void)
 {
 	return (arc_available_memory() < 0);
 }
 
 void
 arc_kmem_reap_soon(void)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 
 #ifdef _KERNEL
 #if defined(_ILP32)
 	/*
 	 * Reclaim unused memory from all kmem caches.
 	 */
 	kmem_reap();
 #endif
 #endif
 
 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
 #if defined(_ILP32)
 		/* reach upper limit of cache size on 32-bit */
 		if (zio_buf_cache[i] == NULL)
 			break;
 #endif
 		if (zio_buf_cache[i] != prev_cache) {
 			prev_cache = zio_buf_cache[i];
 			kmem_cache_reap_now(zio_buf_cache[i]);
 		}
 		if (zio_data_buf_cache[i] != prev_data_cache) {
 			prev_data_cache = zio_data_buf_cache[i];
 			kmem_cache_reap_now(zio_data_buf_cache[i]);
 		}
 	}
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_full_cache);
 	kmem_cache_reap_now(hdr_l2only_cache);
 	kmem_cache_reap_now(zfs_btree_leaf_cache);
 	abd_cache_reap_now();
 }
 
 /*
  * Decide whether arc_evict_cb() should run.  Returns B_TRUE when
  * arc_evict_needed is set, or when the uncached state holds evictable
  * buffers and more than arc_min_prefetch_ms / 2 milliseconds have passed
  * since arc_last_uncached_flush.  Both arguments are unused.
  */
 static boolean_t
 arc_evict_cb_check(void *arg, zthr_t *zthr)
 {
 	(void) arg, (void) zthr;
 
 #ifdef ZFS_DEBUG
 	/*
 	 * This is necessary in order to keep the kstat information
 	 * up to date for tools that display kstat data such as the
 	 * mdb ::arc dcmd and the Linux crash utility.  These tools
 	 * typically do not call kstat's update function, but simply
 	 * dump out stats from the most recent update.  Without
 	 * this call, these commands may show stale stats for the
 	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists.  Even
 	 * with this call, the data might be out of date if the
 	 * evict thread hasn't been woken recently; but that should
 	 * suffice.  The arc_state_t structures can be queried
 	 * directly if more accurate information is needed.
 	 */
 	if (arc_ksp != NULL)
 		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
 #endif
 
 	/*
 	 * We have to rely on arc_wait_for_eviction() to tell us when to
 	 * evict, rather than checking if we are overflowing here, so that we
 	 * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
 	 * If we have become "not overflowing" since arc_wait_for_eviction()
 	 * checked, we need to wake it up.  We could broadcast the CV here,
 	 * but arc_wait_for_eviction() may have not yet gone to sleep.  We
 	 * would need to use a mutex to ensure that this function doesn't
 	 * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
 	 * the arc_evict_lock).  However, the lock ordering of such a lock
 	 * would necessarily be incorrect with respect to the zthr_lock,
 	 * which is held before this function is called, and is held by
 	 * arc_wait_for_eviction() when it calls zthr_wakeup().
 	 */
 	if (arc_evict_needed)
 		return (B_TRUE);
 
 	/*
 	 * If we have buffers in uncached state, evict them periodically.
 	 *
 	 * The sum of the two evictable sizes is used as a truth value:
 	 * '+' binds tighter than '&&', so the expression reads
 	 * (data + metadata) && (elapsed > threshold).
 	 */
 	return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) &&
 	    ddi_get_lbolt() - arc_last_uncached_flush >
 	    MSEC_TO_TICK(arc_min_prefetch_ms / 2)));
 }
 
 /*
  * Keep arc_size under arc_c by running arc_evict which evicts data
  * from the ARC.
  */
 static void
 arc_evict_cb(void *arg, zthr_t *zthr)
 {
 	(void) arg, (void) zthr;
 
 	/* Total bytes evicted by this invocation, across all states. */
 	uint64_t evicted = 0;
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	/* Always try to evict from uncached state. */
 	/* The timestamp is read by arc_evict_cb_check() to pace flushes. */
 	arc_last_uncached_flush = ddi_get_lbolt();
 	evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE);
 	evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE);
 
 	/* Evict from other states only if told to. */
 	if (arc_evict_needed)
 		evicted += arc_evict();
 
 	/*
 	 * If evicted is zero, we couldn't evict anything
 	 * via arc_evict(). This could be due to hash lock
 	 * collisions, but more likely due to the majority of
 	 * arc buffers being unevictable. Therefore, even if
 	 * arc_size is above arc_c, another pass is unlikely to
 	 * be helpful and could potentially cause us to enter an
 	 * infinite loop.  Additionally, zthr_iscancelled() is
 	 * checked here so that if the arc is shutting down, the
 	 * broadcast will wake any remaining arc evict waiters.
 	 */
 	mutex_enter(&arc_evict_lock);
 	/*
 	 * Keep evicting only if the thread is not cancelled, this pass made
 	 * progress, and the ARC size is still above arc_c.
 	 */
 	arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) &&
 	    evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0;
 	if (!arc_evict_needed) {
 		/*
 		 * We're either no longer overflowing, or we
 		 * can't evict anything more, so we should wake
 		 * arc_get_data_impl() sooner.
 		 */
 		arc_evict_waiter_t *aw;
 		/* Drain the whole waiter list, waking each waiter. */
 		while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
 			cv_broadcast(&aw->aew_cv);
 		}
 		arc_set_need_free();
 	}
 	mutex_exit(&arc_evict_lock);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Decide whether arc_reap_cb() should run.  Both arguments are unused.
  *
  * Returns B_TRUE when available memory is negative and no kmem reap is
  * already active; in that case growth is disabled (arc_no_grow), arc_warm
  * is set and arc_growtime is pushed arc_grow_retry seconds into the future.
  * Otherwise arc_no_grow is updated from the current amount of available
  * memory and B_FALSE is returned.  Every 60th call that returns B_FALSE also
  * reaps the zstd context cache.
  */
 static boolean_t
 arc_reap_cb_check(void *arg, zthr_t *zthr)
 {
 	(void) arg, (void) zthr;
 
 	int64_t free_memory = arc_available_memory();
 	static int reap_cb_check_counter = 0;
 
 	/*
 	 * If a kmem reap is already active, don't schedule more.  We must
 	 * check for this because kmem_cache_reap_soon() won't actually
 	 * block on the cache being reaped (this is to prevent callers from
 	 * becoming implicitly blocked by a system-wide kmem reap -- which,
 	 * on a system with many, many full magazines, can take minutes).
 	 */
 	if (!kmem_cache_reap_active() && free_memory < 0) {
 
 		arc_no_grow = B_TRUE;
 		arc_warm = B_TRUE;
 		/*
 		 * Wait at least arc_grow_retry (default 5) seconds
 		 * before considering growing.
 		 */
 		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
 		return (B_TRUE);
 	} else if (free_memory < (int64_t)(arc_c >> arc_no_grow_shift)) {
 		/*
 		 * The threshold is compared as a signed value on purpose.
 		 * free_memory can still be negative here (when a reap is
 		 * already active); an unsigned comparison would turn it into
 		 * a huge positive number, skip this branch and possibly
 		 * re-enable growth below while memory is short.
 		 */
 		arc_no_grow = B_TRUE;
 	} else if (gethrtime() >= arc_growtime) {
 		arc_no_grow = B_FALSE;
 	}
 
 	/*
 	 * Called unconditionally every 60 seconds to reclaim unused
 	 * zstd compression and decompression context. This is done
 	 * here to avoid the need for an independent thread.
 	 */
 	if (!((reap_cb_check_counter++) % 60))
 		zfs_zstd_cache_reap_now();
 
 	return (B_FALSE);
 }
 
 /*
  * Keep enough free memory in the system by reaping the ARC's kmem
  * caches.  To cause more slabs to be reapable, we may reduce the
  * target size of the cache (arc_c), causing the arc_evict_cb()
  * to free more buffers.
  */
 static void
 arc_reap_cb(void *arg, zthr_t *zthr)
 {
 	(void) arg, (void) zthr;
 
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	/* Start asynchronous reaping of all of our kmem caches. */
 	arc_kmem_reap_soon();
 
 	/*
 	 * Pause for at least arc_kmem_cache_reap_retry_ms (rounded up to
 	 * whole ticks) between arc_kmem_reap_soon() calls.  This keeps us
 	 * from spending lots of time reaping caches while near arc_c_min,
 	 * and gives the asynchronous reap a chance to free enough memory
 	 * that the check below finds no need to call
 	 * arc_reduce_target_size().
 	 */
 	delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
 
 	/*
 	 * Shrink the target size as needed to keep free memory at a
 	 * fraction (selected by arc_shrink_shift) of what the ARC could
 	 * still give up.  When oversubscribed (negative free memory) the
 	 * reduction is the deficit plus that fraction; when free memory is
 	 * positive but below the fraction, the reduction is just the
 	 * shortfall.
 	 */
 	int64_t avail = arc_available_memory();
 	int64_t headroom = arc_c - arc_c_min;
 
 	if (headroom > 0) {
 		int64_t shortfall = (headroom >> arc_shrink_shift) - avail;
 
 		if (shortfall > 0)
 			arc_reduce_target_size(shortfall);
 	}
 	spl_fstrans_unmark(cookie);
 }
 
 #ifdef _KERNEL
 /*
  * Determine the amount of memory eligible for eviction contained in the
  * ARC. All clean data reported by the ghost lists can always be safely
  * evicted. Due to arc_c_min, the same does not hold for all clean data
  * contained by the regular mru and mfu lists.
  *
  * In the case of the regular mru and mfu lists, we need to report as
  * much clean data as possible, such that evicting that same reported
  * data will not bring arc_size below arc_c_min. Thus, in certain
  * circumstances, the total amount of clean data in the mru and mfu
  * lists might not actually be evictable.
  *
  * The following two distinct cases are accounted for:
  *
  * 1. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is greater than or equal to arc_c_min.
  *    (i.e. amount of dirty data >= arc_c_min)
  *
  *    This is the easy case; all clean data contained by the mru and mfu
  *    lists is evictable. Evicting all clean data can only drop arc_size
  *    to the amount of dirty data, which is greater than arc_c_min.
  *
  * 2. The sum of the amount of dirty data contained by both the mru and
  *    mfu lists, plus the ARC's other accounting (e.g. the anon list),
  *    is less than arc_c_min.
  *    (i.e. arc_c_min > amount of dirty data)
  *
  *    2.1. arc_size is greater than or equal to arc_c_min.
  *         (i.e. arc_size >= arc_c_min > amount of dirty data)
  *
  *         In this case, not all clean data from the regular mru and mfu
  *         lists is actually evictable; we must leave enough clean data
  *         to keep arc_size above arc_c_min. Thus, the maximum amount of
  *         evictable data from the two lists combined, is exactly the
  *         difference between arc_size and arc_c_min.
  *
  *    2.2. arc_size is less than arc_c_min
  *         (i.e. arc_c_min > arc_size > amount of dirty data)
  *
  *         In this case, none of the data contained in the mru and mfu
  *         lists is evictable, even if it's clean. Since arc_size is
  *         already below arc_c_min, evicting any more would only
  *         increase this negative difference.
  */
 
 #endif /* _KERNEL */
 
 /*
  * Adapt arc info given the number of bytes we are trying to add and
  * the state that we are coming from.  This function is only called
  * when we are adding new content to the cache.
  */
 static void
 arc_adapt(uint64_t bytes)
 {
 	/* No memory available: wake the reap thread instead of growing. */
 	if (arc_reclaim_needed()) {
 		zthr_wakeup(arc_reap_zthr);
 		return;
 	}
 
 	/* Growth is disabled, or the target is already at its ceiling. */
 	if (arc_no_grow || arc_c >= arc_c_max)
 		return;
 
 	/*
 	 * Only grow the target once the cache size is within
 	 * (2 * maxblocksize) bytes of it.
 	 */
 	if (aggsum_upper_bound(&arc_sums.arcstat_size) +
 	    2 * SPA_MAXBLOCKSIZE < arc_c)
 		return;
 
 	/* Grow by at least SPA_OLD_MAXBLOCKSIZE, clamping to arc_c_max. */
 	uint64_t growth = MAX(bytes, SPA_OLD_MAXBLOCKSIZE);
 
 	if (atomic_add_64_nv(&arc_c, growth) > arc_c_max)
 		arc_c = arc_c_max;
 }
 
 /*
  * Check if arc_size has grown past our upper threshold, determined by
  * zfs_arc_overflow_shift.
  */
 static arc_ovf_level_t
 arc_is_overflowing(boolean_t use_reserve)
 {
 	/* At least one block of overflow is always permitted. */
 	int64_t allowance = MAX(SPA_MAXBLOCKSIZE,
 	    arc_c >> zfs_arc_overflow_shift);
 
 	/*
 	 * Only the lower bound of the size is consulted, for performance
 	 * reasons.  The goals are that the ARC never grows without bound
 	 * and that it can still reach its maximum size; this check meets
 	 * both.  The most we could run over by is
 	 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a
 	 * block in the ARC, which in practice is in the tens of MB and low
 	 * enough to be safe.
 	 */
 	int64_t excess = aggsum_lower_bound(&arc_sums.arcstat_size) -
 	    arc_c - allowance / 2;
 
 	if (excess < 0)
 		return (ARC_OVF_NONE);
 
 	/* Callers not using the reserve hit the severe level sooner. */
 	if (!use_reserve)
 		allowance /= 2;
 
 	if (excess < allowance)
 		return (ARC_OVF_SOME);
 	return (ARC_OVF_SEVERE);
 }
 
 /*
  * Account for `size' bytes against hdr via arc_get_data_impl(), then
  * allocate and return an ABD of that size.  A linear ABD is used when
  * alloc_flags contains ARC_HDR_ALLOC_LINEAR.
  */
 static abd_t *
 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
     int alloc_flags)
 {
 	boolean_t is_metadata = (arc_buf_type(hdr) == ARC_BUFC_METADATA);
 
 	arc_get_data_impl(hdr, size, tag, alloc_flags);
 
 	return ((alloc_flags & ARC_HDR_ALLOC_LINEAR) ?
 	    abd_alloc_linear(size, is_metadata) :
 	    abd_alloc(size, is_metadata));
 }
 
 /*
  * Account for `size' bytes against hdr via arc_get_data_impl(), then
  * allocate and return a plain buffer of that size from the zio metadata
  * or data buffer allocator, depending on the header's content type.
  */
 static void *
 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
 {
 	arc_buf_contents_t contents = arc_buf_type(hdr);
 
 	arc_get_data_impl(hdr, size, tag, 0);
 
 	if (contents != ARC_BUFC_METADATA) {
 		ASSERT(contents == ARC_BUFC_DATA);
 		return (zio_data_buf_alloc(size));
 	}
 	return (zio_buf_alloc(size));
 }
 
 /*
  * Wait for the specified amount of data (in bytes) to be evicted from the
  * ARC, and for there to be sufficient free memory in the system.  Waiting for
  * eviction ensures that the memory used by the ARC decreases.  Waiting for
  * free memory ensures that the system won't run out of free pages, regardless
  * of ARC behavior and settings.  See arc_lowmem_init().
  *
  * amount:      number of bytes of eviction to wait for; only used when the
  *              overflow level is ARC_OVF_SEVERE.
  * use_reserve: passed through to arc_is_overflowing() to select the
  *              overflow threshold.
  *
  * Behavior by overflow level:
  *   ARC_OVF_NONE   - return immediately.
  *   ARC_OVF_SOME   - request eviction (if not already requested) and
  *                    return without waiting.
  *   ARC_OVF_SEVERE - queue on arc_evict_waiters and sleep until removed
  *                    from that list.
  */
 void
 arc_wait_for_eviction(uint64_t amount, boolean_t use_reserve)
 {
 	switch (arc_is_overflowing(use_reserve)) {
 	case ARC_OVF_NONE:
 		return;
 	case ARC_OVF_SOME:
 		/*
 		 * This is a bit racy without taking arc_evict_lock, but the
 		 * worst that can happen is that we either call zthr_wakeup()
 		 * an extra time due to a race with another thread here, or
 		 * the flag we set gets cleared by arc_evict_cb().  The latter
 		 * is unlikely due to the big hysteresis, and also not
 		 * important since at this level of overflow the eviction is
 		 * purely advisory.  At the same time, taking the global lock
 		 * here every time without waiting for the actual eviction
 		 * would create significant lock contention.
 		 */
 		if (!arc_evict_needed) {
 			arc_evict_needed = B_TRUE;
 			zthr_wakeup(arc_evict_zthr);
 		}
 		return;
 	case ARC_OVF_SEVERE:
 	default:
 	{
 		/*
 		 * The waiter record lives on this thread's stack.  It is
 		 * linked into arc_evict_waiters below, so we must not return
 		 * until it is off that list again.
 		 */
 		arc_evict_waiter_t aw;
 		list_link_init(&aw.aew_node);
 		cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
 
 		uint64_t last_count = 0;
 		mutex_enter(&arc_evict_lock);
 		if (!list_is_empty(&arc_evict_waiters)) {
 			/* Queue behind the most recently added waiter. */
 			arc_evict_waiter_t *last =
 			    list_tail(&arc_evict_waiters);
 			last_count = last->aew_count;
 		} else if (!arc_evict_needed) {
 			/* First waiter: make sure eviction gets started. */
 			arc_evict_needed = B_TRUE;
 			zthr_wakeup(arc_evict_zthr);
 		}
 		/*
 		 * Note, the last waiter's count may be less than
 		 * arc_evict_count if we are low on memory in which
 		 * case arc_evict_state_impl() may have deferred
 		 * wakeups (but still incremented arc_evict_count).
 		 */
 		aw.aew_count = MAX(last_count, arc_evict_count) + amount;
 
 		list_insert_tail(&arc_evict_waiters, &aw);
 
 		arc_set_need_free();
 
 		DTRACE_PROBE3(arc__wait__for__eviction,
 		    uint64_t, amount,
 		    uint64_t, arc_evict_count,
 		    uint64_t, aw.aew_count);
 
 		/*
 		 * We will be woken up either when arc_evict_count reaches
 		 * aew_count, or when the ARC is no longer overflowing and
 		 * eviction completes.
 		 * In case of a "false" wakeup we will still be on the list,
 		 * so keep sleeping until our node has been unlinked.
 		 */
 		do {
 			cv_wait(&aw.aew_cv, &arc_evict_lock);
 		} while (list_link_active(&aw.aew_node));
 		mutex_exit(&arc_evict_lock);
 
 		cv_destroy(&aw.aew_cv);
 	}
 	}
 }
 
 /*
  * Allocate a block and return it to the caller. If we are hitting the
  * hard limit for the cache size, we must sleep, waiting for the eviction
  * thread to catch up. If we're past the target size but below the hard
  * limit, we'll only signal the reclaim thread and continue on.
  */
 static void
 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag,
     int alloc_flags)
 {
 	arc_adapt(size);
 
 	/*
 	 * If arc_size is currently overflowing, data is being added faster
 	 * than it is evicted.  Rather than compound the problem by pushing
 	 * arc_size even further past its target, wait for the eviction
 	 * thread to make some progress.
 	 *
 	 * Specifically, wait for zfs_arc_eviction_pct percent of the
 	 * requested size to be evicted.  This should be more than 100%, so
 	 * that progress is also made towards getting arc_size under arc_c.
 	 * See the comment above zfs_arc_eviction_pct.
 	 */
 	arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100,
 	    alloc_flags & ARC_HDR_USE_RESERVE);
 
 	/* Charge the space to the metadata or data bucket. */
 	arc_buf_contents_t type = arc_buf_type(hdr);
 	arc_space_consume(size,
 	    type == ARC_BUFC_METADATA ? ARC_SPACE_META : ARC_SPACE_DATA);
 
 	/*
 	 * Update the state size.  Ghost states have a "ghost size" and so
 	 * need no update here.
 	 */
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	if (GHOST_STATE(state))
 		return;
 
 	(void) zfs_refcount_add_many(&state->arcs_size[type], size, tag);
 
 	/*
 	 * When reached via arc_read, the link is protected by the hash
 	 * lock.  When reached via arc_buf_alloc, no other thread should be
 	 * accessing the header.  When reached via arc_read_done, the hash
 	 * lock protects it if it is found in the hash table; otherwise no
 	 * other thread should be trying to [add|remove]_reference it.
 	 */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
 		    size, tag);
 	}
 }
 
 /*
  * Release the ARC accounting for `size' bytes of hdr via
  * arc_free_data_impl(), then free the ABD itself.
  */
 static void
 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size,
     const void *tag)
 {
 	arc_free_data_impl(hdr, size, tag);
 	abd_free(abd);
 }
 
 /*
  * Release the ARC accounting for `size' bytes of hdr via
  * arc_free_data_impl(), then return the buffer to the zio metadata or data
  * buffer allocator, depending on the header's content type.
  */
 static void
 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag)
 {
 	arc_buf_contents_t contents = arc_buf_type(hdr);
 
 	arc_free_data_impl(hdr, size, tag);
 
 	if (contents != ARC_BUFC_METADATA) {
 		ASSERT(contents == ARC_BUFC_DATA);
 		zio_data_buf_free(buf, size);
 		return;
 	}
 	zio_buf_free(buf, size);
 }
 
 /*
  * Free the arc data buffer.
  */
 static void
 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag)
 {
 	arc_buf_contents_t contents = arc_buf_type(hdr);
 	arc_state_t *cur_state = hdr->b_l1hdr.b_state;
 
 	/*
 	 * A header that is linked on a state list also counts towards that
 	 * state's evictable size.  The link is protected by the hash lock,
 	 * if the header is in the hash table.
 	 */
 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
 		ASSERT(cur_state != arc_anon && cur_state != arc_l2c_only);
 
 		(void) zfs_refcount_remove_many(
 		    &cur_state->arcs_esize[contents], size, tag);
 	}
 	(void) zfs_refcount_remove_many(&cur_state->arcs_size[contents],
 	    size, tag);
 
 	/* Return the space to the matching metadata or data bucket. */
 	VERIFY3U(hdr->b_type, ==, contents);
 	if (contents != ARC_BUFC_METADATA) {
 		ASSERT(contents == ARC_BUFC_DATA);
 		arc_space_return(size, ARC_SPACE_DATA);
 		return;
 	}
 	arc_space_return(size, ARC_SPACE_META);
 }
 
 /*
  * This routine is called whenever a buffer is accessed.
  *
  * hdr:       header being accessed.  Its hash lock must be held and it must
  *            have an L1 header (both are asserted).
  * arc_flags: flags describing this access; ARC_FLAG_PREFETCH,
  *            ARC_FLAG_PRESCIENT_PREFETCH and ARC_FLAG_L2CACHE are examined.
  * hit:       selects between the demand_hit and demand_iohit statistics
  *            when a prefetched header receives a demand access.
  *
  * The prefetch flags and statistics are updated first.  Then, depending on
  * the header's current state, hit counters and b_arc_access are updated and
  * the header may be moved to a different state.  An unrecognized state
  * panics.
  */
 static void
 arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit)
 {
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	/*
 	 * Update buffer prefetch status.
 	 */
 	boolean_t was_prefetch = HDR_PREFETCH(hdr);
 	boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH;
 	if (was_prefetch != now_prefetch) {
 		if (was_prefetch) {
 			ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit,
 			    HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive,
 			    prefetch);
 		}
 		/*
 		 * The L2ARC statistics are decremented before the flags
 		 * change and incremented again afterwards.
 		 */
 		if (HDR_HAS_L2HDR(hdr))
 			l2arc_hdr_arcstats_decrement_state(hdr);
 		if (was_prefetch) {
 			arc_hdr_clear_flags(hdr,
 			    ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH);
 		} else {
 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
 		}
 		if (HDR_HAS_L2HDR(hdr))
 			l2arc_hdr_arcstats_increment_state(hdr);
 	}
 	if (now_prefetch) {
 		if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
 			ARCSTAT_BUMP(arcstat_prescient_prefetch);
 		} else {
 			ARCSTAT_BUMP(arcstat_predictive_prefetch);
 		}
 	}
 	if (arc_flags & ARC_FLAG_L2CACHE)
 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 
 	clock_t now = ddi_get_lbolt();
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer is not in the cache, and does not appear in
 		 * our "ghost" lists.  Add it to the MRU or uncached state.
 		 */
 		ASSERT0(hdr->b_l1hdr.b_arc_access);
 		hdr->b_l1hdr.b_arc_access = now;
 		if (HDR_UNCACHED(hdr)) {
 			new_state = arc_uncached;
 			DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *,
 			    hdr);
 		} else {
 			new_state = arc_mru;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		}
 		arc_change_state(new_state, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_mru) {
 		/*
 		 * This buffer has been accessed once recently and either
 		 * its read is still in progress or it is in the cache.
 		 */
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 		hdr->b_l1hdr.b_mru_hits++;
 		ARCSTAT_BUMP(arcstat_mru_hits);
 
 		/*
 		 * If the previous access was a prefetch, then it already
 		 * handled possible promotion, so nothing more to do for now.
 		 */
 		if (was_prefetch) {
 			hdr->b_l1hdr.b_arc_access = now;
 			return;
 		}
 
 		/*
 		 * If more than ARC_MINTIME has passed since the previous
 		 * hit, promote the buffer to the MFU state.
 		 */
 		if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
 		    ARC_MINTIME)) {
 			hdr->b_l1hdr.b_arc_access = now;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 			arc_change_state(arc_mfu, hdr);
 		}
 	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
 		arc_state_t	*new_state;
 		/*
 		 * This buffer has been accessed once recently, but was
 		 * evicted from the cache.  Had the MRU been bigger, this
 		 * would have been an MRU hit, so handle it the same way,
 		 * except that we don't need to check the previous access
 		 * time.
 		 */
 		hdr->b_l1hdr.b_mru_ghost_hits++;
 		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 		wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)],
 		    arc_hdr_size(hdr));
 		if (was_prefetch) {
 			new_state = arc_mru;
 			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		} else {
 			new_state = arc_mfu;
 			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		}
 		arc_change_state(new_state, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
 		/*
 		 * This buffer has been accessed more than once and is either
 		 * still in the cache or being restored from one of the
 		 * ghost lists.
 		 */
 		if (!HDR_IO_IN_PROGRESS(hdr)) {
 			hdr->b_l1hdr.b_mfu_hits++;
 			ARCSTAT_BUMP(arcstat_mfu_hits);
 		}
 		hdr->b_l1hdr.b_arc_access = now;
 	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
 		/*
 		 * This buffer has been accessed more than once recently, but
 		 * has been evicted from the cache.  Had the MFU been bigger,
 		 * it would have stayed in the cache, so move it back to the
 		 * MFU state.
 		 */
 		hdr->b_l1hdr.b_mfu_ghost_hits++;
 		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 		wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)],
 		    arc_hdr_size(hdr));
 		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mfu, hdr);
 	} else if (hdr->b_l1hdr.b_state == arc_uncached) {
 		/*
 		 * This buffer is uncacheable, but we got a hit.  Probably
 		 * a demand read after prefetch.  Nothing more to do here.
 		 */
 		if (!HDR_IO_IN_PROGRESS(hdr))
 			ARCSTAT_BUMP(arcstat_uncached_hits);
 		hdr->b_l1hdr.b_arc_access = now;
 	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
 		/*
 		 * This buffer is on the 2nd Level ARC and was not accessed
 		 * for a long time, so treat it as new and put into MRU.
 		 */
 		hdr->b_l1hdr.b_arc_access = now;
 		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
 		arc_change_state(arc_mru, hdr);
 	} else {
 		cmn_err(CE_PANIC, "invalid arc state 0x%p",
 		    hdr->b_l1hdr.b_state);
 	}
 }
 
 /*
  * This routine is called by dbuf_hold() to update the arc_access() state
  * which otherwise would be skipped for entries in the dbuf cache.
  */
 void
 arc_buf_access(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * As an optimization, peek at the header without the hash lock and
 	 * bail out early when there is nothing to do.  The same test is
 	 * repeated under the lock below, because the header may be
 	 * concurrently released.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr))
 		return;
 
 	kmutex_t *lock = HDR_LOCK(hdr);
 	mutex_enter(lock);
 
 	boolean_t released =
 	    (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr));
 	if (!released) {
 		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 		    hdr->b_l1hdr.b_state == arc_mfu ||
 		    hdr->b_l1hdr.b_state == arc_uncached);
 
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, 0, B_TRUE);
 	}
 	mutex_exit(lock);
 
 	/* Statistics are bumped only after the hash lock is dropped. */
 	if (released) {
 		ARCSTAT_BUMP(arcstat_access_skip);
 		return;
 	}
 
 	ARCSTAT_BUMP(arcstat_hits);
 	ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch,
 	    !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
 }
 
 /*
  * A generic arc_read_done_func_t which you can use.  When a buf is
  * supplied, its contents (arc_buf_size() bytes) are copied to the memory
  * `arg' points at, and the buf is then destroyed using `arg' as the tag.
  * A NULL buf is ignored.
  */
 void
 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *arg)
 {
 	(void) zio, (void) zb, (void) bp;
 
 	if (buf != NULL) {
 		memcpy(arg, buf->b_data, arc_buf_size(buf));
 		arc_buf_destroy(buf, arg);
 	}
 }
 
 /*
  * A generic arc_read_done_func_t.  `arg' points at an arc_buf_t pointer,
  * which receives the buf on success or NULL when no buf was supplied.
  */
 void
 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *arg)
 {
 	(void) zb, (void) bp;
 	arc_buf_t **result = arg;
 
 	if (buf != NULL) {
 		/* A buf is only expected when any zio finished cleanly. */
 		ASSERT(zio == NULL || zio->io_error == 0);
 		*result = buf;
 		ASSERT(buf->b_data != NULL);
 		return;
 	}
 
 	/* No buf: if there was a zio, it must have failed. */
 	ASSERT(zio == NULL || zio->io_error != 0);
 	*result = NULL;
 }
 
 /*
  * Assert that the sizes, compression and protection recorded in hdr agree
  * with the block pointer bp.  Holes and embedded block pointers are
  * expected to have a zero physical size and no compression.
  */
 static void
 arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
 {
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
 		ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
 		ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
 		return;
 	}
 
 	/* The compression type is only compared when hdr tracks it. */
 	if (HDR_COMPRESSION_ENABLED(hdr)) {
 		ASSERT3U(arc_hdr_get_compress(hdr), ==,
 		    BP_GET_COMPRESS(bp));
 	}
 	ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
 	ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
 	ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
 }
 
 /*
  * Completion handling for a read zio whose io_private is the arc_buf_hdr_t
  * being filled (presumably installed as the done callback of the zio issued
  * by arc_read() -- confirm against the caller).
  *
  * In order, this routine:
  *  - looks the header up in the hash table, if it is there, which also
  *    hands back the hash lock that is released further down;
  *  - for protected block pointers, records the encryption parameters (and,
  *    on success, the MAC) in the header;
  *  - on success, records the byteswap function and compression level;
  *  - allocates a buf for every registered callback that wants one;
  *  - on error, flags the header with ARC_FLAG_IO_ERROR, makes it anonymous
  *    and removes it from the hash table;
  *  - clears ARC_FLAG_IO_IN_PROGRESS and drops the reference tagged with hdr;
  *  - after dropping the hash lock, runs every callback in the order in
  *    which it was registered, and wakes or frees each callback record.
  */
 static void
 arc_read_done(zio_t *zio)
 {
 	blkptr_t 	*bp = zio->io_bp;
 	arc_buf_hdr_t	*hdr = zio->io_private;
 	kmutex_t	*hash_lock = NULL;
 	arc_callback_t	*callback_list;
 	arc_callback_t	*acb;
 
 	/*
 	 * The hdr was inserted into hash-table and removed from lists
 	 * prior to starting I/O.  We should find this header, since
 	 * it's in the hash table, and it should be legit since it's
 	 * not possible to evict it during the I/O.  The only possible
 	 * reason for it not to be found is if we were freed during the
 	 * read.
 	 */
 	if (HDR_IN_HASH_TABLE(hdr)) {
 		arc_buf_hdr_t *found;
 
 		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
 		ASSERT3U(hdr->b_dva.dva_word[0], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
 		ASSERT3U(hdr->b_dva.dva_word[1], ==,
 		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
 
 		found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock);
 
 		ASSERT((found == hdr &&
 		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
 		    (found == hdr && HDR_L2_READING(hdr)));
 		ASSERT3P(hash_lock, !=, NULL);
 	}
 
 	/*
 	 * For protected (encrypted or authenticated) blocks, remember the
 	 * object type, objset, salt and IV in the header.  The MAC is only
 	 * decoded when the read itself succeeded.
 	 */
 	if (BP_IS_PROTECTED(bp)) {
 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv);
 
 		if (zio->io_error == 0) {
 			if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
 				void *tmpbuf;
 
 				/*
 				 * For intent log blocks the MAC is decoded
 				 * from the zil_chain_t at the start of the
 				 * data rather than from the bp.
 				 */
 				tmpbuf = abd_borrow_buf_copy(zio->io_abd,
 				    sizeof (zil_chain_t));
 				zio_crypt_decode_mac_zil(tmpbuf,
 				    hdr->b_crypt_hdr.b_mac);
 				abd_return_buf(zio->io_abd, tmpbuf,
 				    sizeof (zil_chain_t));
 			} else {
 				zio_crypt_decode_mac_bp(bp,
 				    hdr->b_crypt_hdr.b_mac);
 			}
 		}
 	}
 
 	if (zio->io_error == 0) {
 		/* byteswap if necessary */
 		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
 			if (BP_GET_LEVEL(zio->io_bp) > 0) {
 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
 			} else {
 				hdr->b_l1hdr.b_byteswap =
 				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
 			}
 		} else {
 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 		}
 		if (!HDR_L2_READING(hdr)) {
 			hdr->b_complevel = zio->io_prop.zp_complevel;
 		}
 	}
 
 	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
 	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
 
 	/* Detach the callback list from the header before walking it. */
 	callback_list = hdr->b_l1hdr.b_acb;
 	ASSERT3P(callback_list, !=, NULL);
 	hdr->b_l1hdr.b_acb = NULL;
 
 	/*
 	 * If a read request has a callback (i.e. acb_done is not NULL), then we
 	 * make a buf containing the data according to the parameters which were
 	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
 	 * aren't needlessly decompressing the data multiple times.
 	 */
 	int callback_cnt = 0;
 	for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
 
 		/*
 		 * We need the last one to call below in original order.
 		 * After this loop callback_list points at the tail of the
 		 * list, and the callbacks are run by following acb_prev.
 		 */
 		callback_list = acb;
 
 		if (!acb->acb_done || acb->acb_nobuf)
 			continue;
 
 		callback_cnt++;
 
 		/* Once any error has been seen, no further bufs are made. */
 		if (zio->io_error != 0)
 			continue;
 
 		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
 		    &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
 		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
 		    &acb->acb_buf);
 
 		/*
 		 * Assert non-speculative zios didn't fail because an
 		 * encryption key wasn't loaded
 		 */
 		ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
 		    error != EACCES);
 
 		/*
 		 * If we failed to decrypt, report an error now (as the zio
 		 * layer would have done if it had done the transforms).
 		 */
 		if (error == ECKSUM) {
 			ASSERT(BP_IS_PROTECTED(bp));
 			error = SET_ERROR(EIO);
 			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 				spa_log_error(zio->io_spa, &acb->acb_zb,
 				    &zio->io_bp->blk_birth);
 				(void) zfs_ereport_post(
 				    FM_EREPORT_ZFS_AUTHENTICATION,
 				    zio->io_spa, NULL, &acb->acb_zb, zio, 0);
 			}
 		}
 
 		if (error != 0) {
 			/*
 			 * Decompression or decryption failed.  Set
 			 * io_error so that when we call acb_done
 			 * (below), we will indicate that the read
 			 * failed. Note that in the unusual case
 			 * where one callback is compressed and another
 			 * uncompressed, we will mark all of them
 			 * as failed, even though the uncompressed
 			 * one can't actually fail.  In this case,
 			 * the hdr will not be anonymous, because
 			 * if there are multiple callbacks, it's
 			 * because multiple threads found the same
 			 * arc buf in the hash table.
 			 */
 			zio->io_error = error;
 		}
 	}
 
 	/*
 	 * If there are multiple callbacks, we must have the hash lock,
 	 * because the only way for multiple threads to find this hdr is
 	 * in the hash table.  This ensures that if there are multiple
 	 * callbacks, the hdr is not anonymous.  If it were anonymous,
 	 * we couldn't use arc_buf_destroy() in the error case below.
 	 */
 	ASSERT(callback_cnt < 2 || hash_lock != NULL);
 
 	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 	} else {
 		/* Failed read: make the header anonymous and unhash it. */
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 		if (hdr->b_l1hdr.b_state != arc_anon)
 			arc_change_state(arc_anon, hdr);
 		if (HDR_IN_HASH_TABLE(hdr))
 			buf_hash_remove(hdr);
 	}
 
 	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 	(void) remove_reference(hdr, hdr);
 
 	/* The callbacks below run without the hash lock held. */
 	if (hash_lock != NULL)
 		mutex_exit(hash_lock);
 
 	/* execute each callback and free its structure */
 	while ((acb = callback_list) != NULL) {
 		if (acb->acb_done != NULL) {
 			if (zio->io_error != 0 && acb->acb_buf != NULL) {
 				/*
 				 * If arc_buf_alloc_impl() fails during
 				 * decompression, the buf will still be
 				 * allocated, and needs to be freed here.
 				 */
 				arc_buf_destroy(acb->acb_buf,
 				    acb->acb_private);
 				acb->acb_buf = NULL;
 			}
 			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
 			    acb->acb_buf, acb->acb_private);
 		}
 
 		/* Propagate the I/O error to the dummy zio, if any. */
 		if (acb->acb_zio_dummy != NULL) {
 			acb->acb_zio_dummy->io_error = zio->io_error;
 			zio_nowait(acb->acb_zio_dummy);
 		}
 
 		/*
 		 * Advance before acb may be freed, either here or by the
 		 * waiting thread once it has been signalled.
 		 */
 		callback_list = acb->acb_prev;
 		if (acb->acb_wait) {
 			mutex_enter(&acb->acb_wait_lock);
 			acb->acb_wait_error = zio->io_error;
 			acb->acb_wait = B_FALSE;
 			cv_signal(&acb->acb_wait_cv);
 			mutex_exit(&acb->acb_wait_lock);
 			/* acb will be freed by the waiting thread. */
 		} else {
 			kmem_free(acb, sizeof (arc_callback_t));
 		}
 	}
 }
 
 /*
  * "Read" the block at the specified DVA (in bp) via the
  * cache.  If the block is found in the cache, invoke the provided
  * callback immediately and return.  Note that the `zio' parameter
  * in the callback will be NULL in this case, since no IO was
  * required.  If the block is not in the cache pass the read request
  * on to the spa with a substitute callback function, so that the
  * requested block will be added to the cache.
  *
  * If a read request arrives for a block that has a read in-progress,
  * either wait for the in-progress read to complete (and return the
  * results); or, if this is a read with a "done" func, add a record
  * to the read to invoke the "done" func when the read completes,
  * and return; or just return.
  *
  * arc_read_done() will invoke all the requested "done" functions
  * for readers of this block.
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
     arc_read_done_func_t *done, void *private, zio_priority_t priority,
     int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = NULL;
 	kmutex_t *hash_lock = NULL;
 	zio_t *rzio;
 	uint64_t guid = spa_load_guid(spa);
 	boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
 	boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
 	boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
 	boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF;
 	arc_buf_t *buf = NULL;
 	int rc = 0;
 
 	ASSERT(!embedded_bp ||
 	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	/*
 	 * Normally SPL_FSTRANS will already be set since kernel threads which
 	 * expect to call the DMU interfaces will set it when created.  System
 	 * calls are similarly handled by setting/cleaning the bit in the
 	 * registered callback (module/os/.../zfs/zpl_*).
 	 *
 	 * External consumers such as Lustre which call the exported DMU
 	 * interfaces may not have set SPL_FSTRANS.  To avoid a deadlock
 	 * on the hash_lock always set and clear the bit.
 	 */
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 top:
 	/*
 	 * Verify the block pointer contents are reasonable.  This should
 	 * always be the case since the blkptr is protected by a checksum.
 	 * However, if there is damage it's desirable to detect this early
 	 * and treat it as a checksum error.  This allows an alternate blkptr
 	 * to be tried when one is available (e.g. ditto blocks).
 	 */
 	if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ?
 	    BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 		rc = SET_ERROR(ECKSUM);
 		goto done;
 	}
 
 	if (!embedded_bp) {
 		/*
 		 * Embedded BP's have no DVA and require no I/O to "read".
 		 * Create an anonymous arc buf to back it.
 		 */
 		hdr = buf_hash_find(guid, bp, &hash_lock);
 	}
 
 	/*
 	 * Determine if we have an L1 cache hit or a cache miss. For simplicity
 	 * we maintain encrypted data separately from compressed / uncompressed
 	 * data. If the user is requesting raw encrypted data and we don't have
 	 * that in the header we will read from disk to guarantee that we can
 	 * get it even if the encryption keys aren't loaded.
 	 */
 	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
 	    (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
 		boolean_t is_data = !HDR_ISTYPE_METADATA(hdr);
 
 		if (HDR_IO_IN_PROGRESS(hdr)) {
 			if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 				mutex_exit(hash_lock);
 				ARCSTAT_BUMP(arcstat_cached_only_in_progress);
 				rc = SET_ERROR(ENOENT);
 				goto done;
 			}
 
 			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
 			ASSERT3P(head_zio, !=, NULL);
 			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
 			    priority == ZIO_PRIORITY_SYNC_READ) {
 				/*
 				 * This is a sync read that needs to wait for
 				 * an in-flight async read. Request that the
 				 * zio have its priority upgraded.
 				 */
 				zio_change_priority(head_zio, priority);
 				DTRACE_PROBE1(arc__async__upgrade__sync,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_async_upgrade_sync);
 			}
 
 			DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr);
 			arc_access(hdr, *arc_flags, B_FALSE);
 
 			/*
 			 * If there are multiple threads reading the same block
 			 * and that block is not yet in the ARC, then only one
 			 * thread will do the physical I/O and all other
 			 * threads will wait until that I/O completes.
 			 * Synchronous reads use the acb_wait_cv whereas nowait
 			 * reads register a callback. Both are signalled/called
 			 * in arc_read_done.
 			 *
 			 * Errors of the physical I/O may need to be propagated.
 			 * Synchronous read errors are returned here from
 			 * arc_read_done via acb_wait_error.  Nowait reads
 			 * attach the acb_zio_dummy zio to pio and
 			 * arc_read_done propagates the physical I/O's io_error
 			 * to acb_zio_dummy, and thereby to pio.
 			 */
 			arc_callback_t *acb = NULL;
 			if (done || pio || *arc_flags & ARC_FLAG_WAIT) {
 				acb = kmem_zalloc(sizeof (arc_callback_t),
 				    KM_SLEEP);
 				acb->acb_done = done;
 				acb->acb_private = private;
 				acb->acb_compressed = compressed_read;
 				acb->acb_encrypted = encrypted_read;
 				acb->acb_noauth = noauth_read;
 				acb->acb_nobuf = no_buf;
 				if (*arc_flags & ARC_FLAG_WAIT) {
 					acb->acb_wait = B_TRUE;
 					mutex_init(&acb->acb_wait_lock, NULL,
 					    MUTEX_DEFAULT, NULL);
 					cv_init(&acb->acb_wait_cv, NULL,
 					    CV_DEFAULT, NULL);
 				}
 				acb->acb_zb = *zb;
 				if (pio != NULL) {
 					acb->acb_zio_dummy = zio_null(pio,
 					    spa, NULL, NULL, NULL, zio_flags);
 				}
 				acb->acb_zio_head = head_zio;
 				acb->acb_next = hdr->b_l1hdr.b_acb;
 				hdr->b_l1hdr.b_acb->acb_prev = acb;
 				hdr->b_l1hdr.b_acb = acb;
 			}
 			mutex_exit(hash_lock);
 
 			ARCSTAT_BUMP(arcstat_iohits);
 			ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 			    demand, prefetch, is_data, data, metadata, iohits);
 
 			if (*arc_flags & ARC_FLAG_WAIT) {
 				mutex_enter(&acb->acb_wait_lock);
 				while (acb->acb_wait) {
 					cv_wait(&acb->acb_wait_cv,
 					    &acb->acb_wait_lock);
 				}
 				rc = acb->acb_wait_error;
 				mutex_exit(&acb->acb_wait_lock);
 				mutex_destroy(&acb->acb_wait_lock);
 				cv_destroy(&acb->acb_wait_cv);
 				kmem_free(acb, sizeof (arc_callback_t));
 			}
 			goto out;
 		}
 
 		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
 		    hdr->b_l1hdr.b_state == arc_mfu ||
 		    hdr->b_l1hdr.b_state == arc_uncached);
 
 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
 		arc_access(hdr, *arc_flags, B_TRUE);
 
 		if (done && !no_buf) {
 			ASSERT(!embedded_bp || !BP_IS_HOLE(bp));
 
 			/* Get a buf with the desired data in it. */
 			rc = arc_buf_alloc_impl(hdr, spa, zb, private,
 			    encrypted_read, compressed_read, noauth_read,
 			    B_TRUE, &buf);
 			if (rc == ECKSUM) {
 				/*
 				 * Convert authentication and decryption errors
 				 * to EIO (and generate an ereport if needed)
 				 * before leaving the ARC.
 				 */
 				rc = SET_ERROR(EIO);
 				if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
 					spa_log_error(spa, zb, &hdr->b_birth);
 					(void) zfs_ereport_post(
 					    FM_EREPORT_ZFS_AUTHENTICATION,
 					    spa, NULL, zb, NULL, 0);
 				}
 			}
 			if (rc != 0) {
 				arc_buf_destroy_impl(buf);
 				buf = NULL;
 				(void) remove_reference(hdr, private);
 			}
 
 			/* assert any errors weren't due to unloaded keys */
 			ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
 			    rc != EACCES);
 		}
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
 		ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 		    demand, prefetch, is_data, data, metadata, hits);
 		*arc_flags |= ARC_FLAG_CACHED;
 		goto done;
 	} else {
 		uint64_t lsize = BP_GET_LSIZE(bp);
 		uint64_t psize = BP_GET_PSIZE(bp);
 		arc_callback_t *acb;
 		vdev_t *vd = NULL;
 		uint64_t addr = 0;
 		boolean_t devw = B_FALSE;
 		uint64_t size;
 		abd_t *hdr_abd;
 		int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0;
 		arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
 
 		if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
 			if (hash_lock != NULL)
 				mutex_exit(hash_lock);
 			rc = SET_ERROR(ENOENT);
 			goto done;
 		}
 
 		if (hdr == NULL) {
 			/*
 			 * This block is not in the cache or it has
 			 * embedded data.
 			 */
 			arc_buf_hdr_t *exists = NULL;
 			hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
 			    BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type);
 
 			if (!embedded_bp) {
 				hdr->b_dva = *BP_IDENTITY(bp);
 				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
 				exists = buf_hash_insert(hdr, &hash_lock);
 			}
 			if (exists != NULL) {
 				/* somebody beat us to the hash insert */
 				mutex_exit(hash_lock);
 				buf_discard_identity(hdr);
 				arc_hdr_destroy(hdr);
 				goto top; /* restart the IO request */
 			}
 		} else {
 			/*
 			 * This block is in the ghost cache or encrypted data
 			 * was requested and we didn't have it. If it was
 			 * L2-only (and thus didn't have an L1 hdr),
 			 * we realloc the header to add an L1 hdr.
 			 */
 			if (!HDR_HAS_L1HDR(hdr)) {
 				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
 				    hdr_full_cache);
 			}
 
 			if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
 				ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 				ASSERT(!HDR_HAS_RABD(hdr));
 				ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 				ASSERT0(zfs_refcount_count(
 				    &hdr->b_l1hdr.b_refcnt));
 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
 #ifdef ZFS_DEBUG
 				ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
 #endif
 			} else if (HDR_IO_IN_PROGRESS(hdr)) {
 				/*
 				 * If this header already had an IO in progress
 				 * and we are performing another IO to fetch
 				 * encrypted data we must wait until the first
 				 * IO completes so as not to confuse
 				 * arc_read_done(). This should be very rare
 				 * and so the performance impact shouldn't
 				 * matter.
 				 */
 				arc_callback_t *acb = kmem_zalloc(
 				    sizeof (arc_callback_t), KM_SLEEP);
 				acb->acb_wait = B_TRUE;
 				mutex_init(&acb->acb_wait_lock, NULL,
 				    MUTEX_DEFAULT, NULL);
 				cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT,
 				    NULL);
 				acb->acb_zio_head =
 				    hdr->b_l1hdr.b_acb->acb_zio_head;
 				acb->acb_next = hdr->b_l1hdr.b_acb;
 				hdr->b_l1hdr.b_acb->acb_prev = acb;
 				hdr->b_l1hdr.b_acb = acb;
 				mutex_exit(hash_lock);
 				mutex_enter(&acb->acb_wait_lock);
 				while (acb->acb_wait) {
 					cv_wait(&acb->acb_wait_cv,
 					    &acb->acb_wait_lock);
 				}
 				mutex_exit(&acb->acb_wait_lock);
 				mutex_destroy(&acb->acb_wait_lock);
 				cv_destroy(&acb->acb_wait_cv);
 				kmem_free(acb, sizeof (arc_callback_t));
 				goto top;
 			}
 		}
 		if (*arc_flags & ARC_FLAG_UNCACHED) {
 			arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
 			if (!encrypted_read)
 				alloc_flags |= ARC_HDR_ALLOC_LINEAR;
 		}
 
 		/*
 		 * Take additional reference for IO_IN_PROGRESS.  It stops
 		 * arc_access() from putting this header without any buffers
 		 * and so other references but obviously nonevictable onto
 		 * the evictable list of MRU or MFU state.
 		 */
 		add_reference(hdr, hdr);
 		if (!embedded_bp)
 			arc_access(hdr, *arc_flags, B_FALSE);
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		arc_hdr_alloc_abd(hdr, alloc_flags);
 		if (encrypted_read) {
 			ASSERT(HDR_HAS_RABD(hdr));
 			size = HDR_GET_PSIZE(hdr);
 			hdr_abd = hdr->b_crypt_hdr.b_rabd;
 			zio_flags |= ZIO_FLAG_RAW;
 		} else {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			size = arc_hdr_size(hdr);
 			hdr_abd = hdr->b_l1hdr.b_pabd;
 
 			if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
 				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 			}
 
 			/*
 			 * For authenticated bp's, we do not ask the ZIO layer
 			 * to authenticate them since this will cause the entire
 			 * IO to fail if the key isn't loaded. Instead, we
 			 * defer authentication until arc_buf_fill(), which will
 			 * verify the data when the key is available.
 			 */
 			if (BP_IS_AUTHENTICATED(bp))
 				zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
 		}
 
 		if (BP_IS_AUTHENTICATED(bp))
 			arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
 		if (BP_GET_LEVEL(bp) > 0)
 			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
 		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
 
 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
 		acb->acb_done = done;
 		acb->acb_private = private;
 		acb->acb_compressed = compressed_read;
 		acb->acb_encrypted = encrypted_read;
 		acb->acb_noauth = noauth_read;
 		acb->acb_zb = *zb;
 
 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 		hdr->b_l1hdr.b_acb = acb;
 
 		if (HDR_HAS_L2HDR(hdr) &&
 		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
 			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
 			addr = hdr->b_l2hdr.b_daddr;
 			/*
 			 * Lock out L2ARC device removal.
 			 */
 			if (vdev_is_dead(vd) ||
 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
 				vd = NULL;
 		}
 
 		/*
 		 * We count both async reads and scrub IOs as asynchronous so
 		 * that both can be upgraded in the event of a cache hit while
 		 * the read IO is still in-flight.
 		 */
 		if (priority == ZIO_PRIORITY_ASYNC_READ ||
 		    priority == ZIO_PRIORITY_SCRUB)
 			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 		else
 			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
 
 		/*
 		 * At this point, we have a level 1 cache miss or a blkptr
 		 * with embedded data.  Try again in L2ARC if possible.
 		 */
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
 
 		/*
 		 * Skip ARC stat bump for block pointers with embedded
 		 * data. The data are read from the blkptr itself via
 		 * decode_embedded_bp_compressed().
 		 */
 		if (!embedded_bp) {
 			DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
 			    blkptr_t *, bp, uint64_t, lsize,
 			    zbookmark_phys_t *, zb);
 			ARCSTAT_BUMP(arcstat_misses);
 			ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH),
 			    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
 			    metadata, misses);
 			zfs_racct_read(size, 1);
 		}
 
 		/* Check if the spa even has l2 configured */
 		const boolean_t spa_has_l2 = l2arc_ndev != 0 &&
 		    spa->spa_l2cache.sav_count > 0;
 
 		if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) {
 			/*
 			 * Read from the L2ARC if the following are true:
 			 * 1. The L2ARC vdev was previously cached.
 			 * 2. This buffer still has L2ARC metadata.
 			 * 3. This buffer isn't currently writing to the L2ARC.
 			 * 4. The L2ARC entry wasn't evicted, which may
 			 *    also have invalidated the vdev.
 			 */
 			if (HDR_HAS_L2HDR(hdr) &&
 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) {
 				l2arc_read_callback_t *cb;
 				abd_t *abd;
 				uint64_t asize;
 
 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_hits);
 				hdr->b_l2hdr.b_hits++;
 
 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
 				    KM_SLEEP);
 				cb->l2rcb_hdr = hdr;
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
 
 				/*
 				 * When Compressed ARC is disabled, but the
 				 * L2ARC block is compressed, arc_hdr_size()
 				 * will have returned LSIZE rather than PSIZE.
 				 */
 				if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 				    !HDR_COMPRESSION_ENABLED(hdr) &&
 				    HDR_GET_PSIZE(hdr) != 0) {
 					size = HDR_GET_PSIZE(hdr);
 				}
 
 				asize = vdev_psize_to_asize(vd, size);
 				if (asize != size) {
 					abd = abd_alloc_for_io(asize,
 					    HDR_ISTYPE_METADATA(hdr));
 					cb->l2rcb_abd = abd;
 				} else {
 					abd = hdr_abd;
 				}
 
 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
 				    addr + asize <= vd->vdev_psize -
 				    VDEV_LABEL_END_SIZE);
 
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
 				 * Issue a null zio if the underlying buffer
 				 * was squashed to zero size by compression.
 				 */
 				ASSERT3U(arc_hdr_get_compress(hdr), !=,
 				    ZIO_COMPRESS_EMPTY);
 				rzio = zio_read_phys(pio, vd, addr,
 				    asize, abd,
 				    ZIO_CHECKSUM_OFF,
 				    l2arc_read_done, cb, priority,
 				    zio_flags | ZIO_FLAG_CANFAIL |
 				    ZIO_FLAG_DONT_PROPAGATE |
 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
 				acb->acb_zio_head = rzio;
 
 				if (hash_lock != NULL)
 					mutex_exit(hash_lock);
 
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
 				ARCSTAT_INCR(arcstat_l2_read_bytes,
 				    HDR_GET_PSIZE(hdr));
 
 				if (*arc_flags & ARC_FLAG_NOWAIT) {
 					zio_nowait(rzio);
 					goto out;
 				}
 
 				ASSERT(*arc_flags & ARC_FLAG_WAIT);
 				if (zio_wait(rzio) == 0)
 					goto out;
 
 				/* l2arc read error; goto zio_read() */
 				if (hash_lock != NULL)
 					mutex_enter(hash_lock);
 			} else {
 				DTRACE_PROBE1(l2arc__miss,
 				    arc_buf_hdr_t *, hdr);
 				ARCSTAT_BUMP(arcstat_l2_misses);
 				if (HDR_L2_WRITING(hdr))
 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
 				spa_config_exit(spa, SCL_L2ARC, vd);
 			}
 		} else {
 			if (vd != NULL)
 				spa_config_exit(spa, SCL_L2ARC, vd);
 
 			/*
 			 * Only a spa with l2 should contribute to l2
 			 * miss stats.  (Including the case of having a
 			 * faulted cache device - that's also a miss.)
 			 */
 			if (spa_has_l2) {
 				/*
 				 * Skip ARC stat bump for block pointers with
 				 * embedded data. The data are read from the
 				 * blkptr itself via
 				 * decode_embedded_bp_compressed().
 				 */
 				if (!embedded_bp) {
 					DTRACE_PROBE1(l2arc__miss,
 					    arc_buf_hdr_t *, hdr);
 					ARCSTAT_BUMP(arcstat_l2_misses);
 				}
 			}
 		}
 
 		rzio = zio_read(pio, spa, bp, hdr_abd, size,
 		    arc_read_done, hdr, priority, zio_flags, zb);
 		acb->acb_zio_head = rzio;
 
 		if (hash_lock != NULL)
 			mutex_exit(hash_lock);
 
 		if (*arc_flags & ARC_FLAG_WAIT) {
 			rc = zio_wait(rzio);
 			goto out;
 		}
 
 		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
 		zio_nowait(rzio);
 	}
 
 out:
 	/* embedded bps don't actually go to disk */
 	if (!embedded_bp)
 		spa_read_history_add(spa, zb, *arc_flags);
 	spl_fstrans_unmark(cookie);
 	return (rc);
 
 done:
 	if (done)
 		done(NULL, zb, bp, buf, private);
 	if (pio && rc != 0) {
 		zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags);
 		zio->io_error = rc;
 		zio_nowait(zio);
 	}
 	goto out;
 }
 
 /*
  * Allocate an arc_prune_t describing the given callback and its private
  * argument, link it at the head of arc_prune_list and return it.  The list
  * holds one reference on the entry (tagged with &arc_prune_list) for as
  * long as it stays linked.
  */
 arc_prune_t *
 arc_add_prune_callback(arc_prune_func_t *func, void *private)
 {
 	arc_prune_t *entry = kmem_alloc(sizeof (arc_prune_t), KM_SLEEP);
 
 	entry->p_private = private;
 	entry->p_pfunc = func;
 	zfs_refcount_create(&entry->p_refcnt);
 	list_link_init(&entry->p_node);
 
 	/* Publish the entry; the list's hold is taken under the list lock. */
 	mutex_enter(&arc_prune_mtx);
 	zfs_refcount_add(&entry->p_refcnt, &arc_prune_list);
 	list_insert_head(&arc_prune_list, entry);
 	mutex_exit(&arc_prune_mtx);
 
 	return (entry);
 }
 
 /*
  * Unlink a prune callback entry from arc_prune_list, drop the list's hold
  * on it and free it.  If other holds remain after the list's hold is
  * dropped, wait for the prune taskq to drain before destroying the entry.
  */
 void
 arc_remove_prune_callback(arc_prune_t *p)
 {
 	boolean_t task_outstanding;
 
 	mutex_enter(&arc_prune_mtx);
 	list_remove(&arc_prune_list, p);
 	task_outstanding =
 	    (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) ?
 	    B_TRUE : B_FALSE;
 	mutex_exit(&arc_prune_mtx);
 
 	/* A remaining hold means arc_prune_task has not finished yet. */
 	if (task_outstanding) {
 		taskq_wait_outstanding(arc_prune_taskq, 0);
 	}
 
 	ASSERT0(zfs_refcount_count(&p->p_refcnt));
 	zfs_refcount_destroy(&p->p_refcnt);
 	kmem_free(p, sizeof (arc_prune_t));
 }
 
 /*
  * Helper function for arc_prune_async() it is responsible for safely
  * handling the execution of a registered arc_prune_func_t.
  */
 static void
 arc_prune_task(void *ptr)
 {
 	arc_prune_t *ap = (arc_prune_t *)ptr;
 	arc_prune_func_t *func = ap->p_pfunc;
 
 	if (func != NULL)
 		func(ap->p_adjust, ap->p_private);
 
 	zfs_refcount_remove(&ap->p_refcnt, func);
 }
 
 /*
  * Ask every registered consumer to release holds on part of the ARC
  * buffers it references, so the ARC can honor the metadata limit and
  * reclaim buffers that would otherwise stay pinned.
  *
  * The work is handed to arc_prune_taskq so that this function may be
  * called safely from the context of the arc_reclaim_thread().  For each
  * arc_prune_t that is dispatched a hold is taken here; arc_prune_task()
  * drops it once the registered arc_prune_func_t has returned.
  */
 static void
 arc_prune_async(uint64_t adjust)
 {
 	mutex_enter(&arc_prune_mtx);
 
 	for (arc_prune_t *entry = list_head(&arc_prune_list); entry != NULL;
 	    entry = list_next(&arc_prune_list, entry)) {
 		/* Two or more holds: a task for this entry is outstanding. */
 		if (zfs_refcount_count(&entry->p_refcnt) < 2) {
 			zfs_refcount_add(&entry->p_refcnt, entry->p_pfunc);
 			entry->p_adjust = adjust;
 
 			if (taskq_dispatch(arc_prune_taskq, arc_prune_task,
 			    entry, TQ_SLEEP) != TASKQID_INVALID) {
 				ARCSTAT_BUMP(arcstat_prune);
 			} else {
 				/* Dispatch failed; give the hold back. */
 				zfs_refcount_remove(&entry->p_refcnt,
 				    entry->p_pfunc);
 			}
 		}
 	}
 
 	mutex_exit(&arc_prune_mtx);
 }
 
 /*
  * Tell the ARC that the block described by bp has been freed and will
  * never be read again.  If a header for it is cached and nothing is
  * holding it, the header is destroyed; otherwise it is left untouched.
  */
 void
 arc_freed(spa_t *spa, const blkptr_t *bp)
 {
 	const uint64_t load_guid = spa_load_guid(spa);
 	kmutex_t *hash_lock;
 	arc_buf_hdr_t *hdr;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 
 	hdr = buf_hash_find(load_guid, bp, &hash_lock);
 	if (hdr == NULL) {
 		return;
 	}
 
 	/*
 	 * The block being freed may still be doing I/O (e.g. a prefetch) or
 	 * may carry some other reference (e.g. a dedup-ed, dmu_sync-ed
 	 * block).  The latter arises as follows: dmu_sync() writes the new
 	 * block to its final location without the dedup flag set, leaving
 	 * the hdr in the MRU state and discoverable.  When the txg syncs,
 	 * the block is found to have been overridden in open context and an
 	 * override I/O is issued.  Because this is a dedup block, that I/O
 	 * checks the DDT; on a match it replaces io_bp with the DDT's bp and
 	 * lets the I/O finish.  The done callback,
 	 * dbuf_write_override_done, then sees that io_bp and io_bp_override
 	 * differ and frees the override bp, which brings us here with a
 	 * reference still held on the block.  Hence a hdr with an I/O in
 	 * progress or any outstanding reference is left alone.
 	 */
 	if (!HDR_HAS_L1HDR(hdr) ||
 	    zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
 		arc_change_state(arc_anon, hdr);
 		arc_hdr_destroy(hdr);
 	}
 
 	/* The hash lock is dropped on both the destroy and keep paths. */
 	mutex_exit(hash_lock);
 }
 
 /*
  * Release this buffer from the cache, making it an anonymous buffer.  This
  * must be done after a read and prior to modifying the buffer contents.
  * If the buffer has more than one reference, we must make
  * a new hdr for the buffer.
  *
  * buf is the buffer to release; tag is the reference holder that is moved
  * from the old hdr to the new one when a new hdr has to be created.
  *
  * Three cases are handled:
  *  - the hdr is already in arc_anon: only its identity and access time are
  *    reset and the buffer is thawed, without taking the hash lock;
  *  - the hdr has other bufs besides this one: buf is detached and attached
  *    to a freshly allocated anonymous hdr;
  *  - buf is the hdr's only buf: the hdr itself is moved to arc_anon.
  */
 void
 arc_release(arc_buf_t *buf, const void *tag)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	/*
 	 * It would be nice to assert that if its DMU metadata (level >
 	 * 0 || it's the dnode file), then it must be syncing context.
 	 * But we don't know that information at this level.
 	 */
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 
 	/*
 	 * We don't grab the hash lock prior to this check, because if
 	 * the buffer's header is in the arc_anon state, it won't be
 	 * linked into the hash table.
 	 */
 	if (hdr->b_l1hdr.b_state == arc_anon) {
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
 		ASSERT(!HDR_HAS_L2HDR(hdr));
 
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 		ASSERT(ARC_BUF_LAST(buf));
 		ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 
 		hdr->b_l1hdr.b_arc_access = 0;
 
 		/*
 		 * If the buf is being overridden then it may already
 		 * have a hdr that is not empty.
 		 */
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 
 		return;
 	}
 
 	kmutex_t *hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
 	/*
 	 * This assignment is only valid as long as the hash_lock is
 	 * held, we must be careful not to reference state or the
 	 * b_state field after dropping the lock.
 	 */
 	arc_state_t *state = hdr->b_l1hdr.b_state;
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 	ASSERT3P(state, !=, arc_anon);
 
 	/* this buffer is not on any list */
 	ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
 
 	if (HDR_HAS_L2HDR(hdr)) {
 		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 
 		/*
 		 * We have to recheck this conditional again now that
 		 * we're holding the l2ad_mtx to prevent a race with
 		 * another thread which might be concurrently calling
 		 * l2arc_evict(). In that case, l2arc_evict() might have
 		 * destroyed the header's L2 portion as we were waiting
 		 * to acquire the l2ad_mtx.
 		 */
 		if (HDR_HAS_L2HDR(hdr))
 			arc_hdr_l2hdr_destroy(hdr);
 
 		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
 	}
 
 	/*
 	 * Do we have more than one buf?
 	 */
 	if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) {
 		/*
 		 * Capture the attributes needed to build the replacement
 		 * hdr now, while the hash lock is still held.
 		 */
 		arc_buf_hdr_t *nhdr;
 		uint64_t spa = hdr->b_spa;
 		uint64_t psize = HDR_GET_PSIZE(hdr);
 		uint64_t lsize = HDR_GET_LSIZE(hdr);
 		boolean_t protected = HDR_PROTECTED(hdr);
 		enum zio_compress compress = arc_hdr_get_compress(hdr);
 		arc_buf_contents_t type = arc_buf_type(hdr);
 		VERIFY3U(hdr->b_type, ==, type);
 
 		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
 		/* Other bufs remain, so the hdr must keep at least one hold. */
 		VERIFY3S(remove_reference(hdr, tag), >, 0);
 
 		if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(ARC_BUF_LAST(buf));
 		}
 
 		/*
 		 * Pull the data off of this hdr and attach it to
 		 * a new anonymous hdr. Also find the last buffer
 		 * in the hdr's buffer list.
 		 */
 		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
 		ASSERT3P(lastbuf, !=, NULL);
 
 		/*
 		 * If the current arc_buf_t and the hdr are sharing their data
 		 * buffer, then we must stop sharing that block.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
 			ASSERT(!arc_buf_is_shared(lastbuf));
 
 			/*
 			 * First, sever the block sharing relationship between
 			 * buf and the arc_buf_hdr_t.
 			 */
 			arc_unshare_buf(hdr, buf);
 
 			/*
 			 * Now we need to recreate the hdr's b_pabd. Since we
 			 * have lastbuf handy, we try to share with it, but if
 			 * we can't then we allocate a new b_pabd and copy the
 			 * data from buf into it.
 			 */
 			if (arc_can_share(hdr, lastbuf)) {
 				arc_share_buf(hdr, lastbuf);
 			} else {
 				arc_hdr_alloc_abd(hdr, 0);
 				abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
 				    buf->b_data, psize);
 			}
 			VERIFY3P(lastbuf->b_data, !=, NULL);
 		} else if (HDR_SHARED_DATA(hdr)) {
 			/*
 			 * Uncompressed shared buffers are always at the end
 			 * of the list. Compressed buffers don't have the
 			 * same requirements. This makes it hard to
 			 * simply assert that the lastbuf is shared so
 			 * we rely on the hdr's compression flags to determine
 			 * if we have a compressed, shared buffer.
 			 */
 			ASSERT(arc_buf_is_shared(lastbuf) ||
 			    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
 			ASSERT(!arc_buf_is_shared(buf));
 		}
 
 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
 		ASSERT3P(state, !=, arc_l2c_only);
 
 		/* Stop charging buf's size to the old state. */
 		(void) zfs_refcount_remove_many(&state->arcs_size[type],
 		    arc_buf_size(buf), buf);
 
 		if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
 			ASSERT3P(state, !=, arc_l2c_only);
 			(void) zfs_refcount_remove_many(
 			    &state->arcs_esize[type],
 			    arc_buf_size(buf), buf);
 		}
 
 		arc_cksum_verify(buf);
 		arc_buf_unwatch(buf);
 
 		/* if this is the last uncompressed buf free the checksum */
 		if (!arc_hdr_has_uncompressed_buf(hdr))
 			arc_cksum_free(hdr);
 
 		mutex_exit(hash_lock);
 
 		nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
 		    compress, hdr->b_complevel, type);
 		ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
 		ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
 		VERIFY3U(nhdr->b_type, ==, type);
 		ASSERT(!HDR_SHARED_DATA(nhdr));
 
 		/*
 		 * Attach buf to the new hdr and re-take the caller's hold
 		 * (same tag) on it.
 		 */
 		nhdr->b_l1hdr.b_buf = buf;
 		(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
 		buf->b_hdr = nhdr;
 
 		/* Charge buf's size to the anonymous state instead. */
 		(void) zfs_refcount_add_many(&arc_anon->arcs_size[type],
 		    arc_buf_size(buf), buf);
 	} else {
 		/*
 		 * buf is the hdr's only buf: keep the hdr, reset its hit
 		 * counters and access time, and move it to arc_anon.
 		 */
 		ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
 		/* protected by hash lock, or hdr is on arc_anon */
 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 		hdr->b_l1hdr.b_mru_hits = 0;
 		hdr->b_l1hdr.b_mru_ghost_hits = 0;
 		hdr->b_l1hdr.b_mfu_hits = 0;
 		hdr->b_l1hdr.b_mfu_ghost_hits = 0;
 		arc_change_state(arc_anon, hdr);
 		hdr->b_l1hdr.b_arc_access = 0;
 
 		mutex_exit(hash_lock);
 		buf_discard_identity(hdr);
 		arc_buf_thaw(buf);
 	}
 }
 
 /*
  * Return nonzero when buf has a data pointer and its header is in the
  * arc_anon state; return zero otherwise.
  */
 int
 arc_released(arc_buf_t *buf)
 {
 	if (buf->b_data == NULL) {
 		return (0);
 	}
 
 	return (buf->b_hdr->b_l1hdr.b_state == arc_anon);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only helper: return the current reference count of the header
  * that buf is attached to.
  */
 int
 arc_referenced(arc_buf_t *buf)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	return (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
 }
 #endif
 
 /*
  * Ready callback for zios issued by arc_write().  It runs the caller's
  * awcb_ready callback and then brings the ARC header in line with the
  * block pointer the zio produced: it marks the hdr as having I/O in
  * progress, records encryption parameters for protected blocks, sets the
  * hdr's psize and compression, and finally either copies the written data
  * into the hdr or shares the buf's data with it.
  *
  * When the zio is being re-executed, state left behind by the previous
  * invocation is torn down first.
  */
 static void
 arc_write_ready(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	blkptr_t *bp = zio->io_bp;
 	/* A hole has no physical size. */
 	uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 
 	ASSERT(HDR_HAS_L1HDR(hdr));
 	ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
 	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 
 	/*
 	 * If we're reexecuting this zio because the pool suspended, then
 	 * cleanup any state that was previously set the first time the
 	 * callback was invoked.
 	 */
 	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
 		arc_cksum_free(hdr);
 		arc_buf_unwatch(buf);
 		if (hdr->b_l1hdr.b_pabd != NULL) {
 			if (ARC_BUF_SHARED(buf)) {
 				arc_unshare_buf(hdr, buf);
 			} else {
 				ASSERT(!arc_buf_is_shared(buf));
 				arc_hdr_free_abd(hdr, B_FALSE);
 			}
 		}
 
 		if (HDR_HAS_RABD(hdr))
 			arc_hdr_free_abd(hdr, B_TRUE);
 	}
 	/* From here on the hdr must own no data buffers of its own. */
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 	ASSERT(!HDR_HAS_RABD(hdr));
 	ASSERT(!HDR_SHARED_DATA(hdr));
 	ASSERT(!arc_buf_is_shared(buf));
 
 	callback->awcb_ready(zio, buf, callback->awcb_private);
 
 	/*
 	 * Set IO_IN_PROGRESS and take its hold only once; if the flag is
 	 * already set this must be a re-execution.
 	 */
 	if (HDR_IO_IN_PROGRESS(hdr)) {
 		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
 	} else {
 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */
 	}
 
 	if (BP_IS_PROTECTED(bp)) {
 		/* ZIL blocks are written through zio_rewrite */
 		ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
 
 		if (BP_SHOULD_BYTESWAP(bp)) {
 			if (BP_GET_LEVEL(bp) > 0) {
 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
 			} else {
 				hdr->b_l1hdr.b_byteswap =
 				    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
 			}
 		} else {
 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
 		}
 
 		/* Record the bp's crypto parameters in the hdr. */
 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv);
 		zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
 	} else {
 		arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED);
 	}
 
 	/*
 	 * If this block was written for raw encryption but the zio layer
 	 * ended up only authenticating it, adjust the buffer flags now.
 	 */
 	if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
 		arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 		if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
 			buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 	} else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
 	}
 
 	/* this must be done after the buffer flags are adjusted */
 	arc_cksum_compute(buf);
 
 	/* Holes and embedded bps are recorded as uncompressed in the hdr. */
 	enum zio_compress compress;
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
 		compress = ZIO_COMPRESS_OFF;
 	} else {
 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
 		compress = BP_GET_COMPRESS(bp);
 	}
 	HDR_SET_PSIZE(hdr, psize);
 	arc_hdr_set_compress(hdr, compress);
 	hdr->b_complevel = zio->io_prop.zp_complevel;
 
 	/* Skip filling the hdr if the zio failed or psize is zero. */
 	if (zio->io_error != 0 || psize == 0)
 		goto out;
 
 	/*
 	 * Fill the hdr with data. If the buffer is encrypted we have no choice
 	 * but to copy the data into b_rabd. If the hdr is compressed, the data
 	 * we want is available from the zio, otherwise we can take it from
 	 * the buf.
 	 *
 	 * We might be able to share the buf's data with the hdr here. However,
 	 * doing so would cause the ARC to be full of linear ABDs if we write a
 	 * lot of shareable data. As a compromise, we check whether scattered
 	 * ABDs are allowed, and assume that if they are then the user wants
 	 * the ARC to be primarily filled with them regardless of the data being
 	 * written. Therefore, if they're allowed then we allocate one and copy
 	 * the data into it; otherwise, we share the data directly if we can.
 	 */
 	if (ARC_BUF_ENCRYPTED(buf)) {
 		ASSERT3U(psize, >, 0);
 		ASSERT(ARC_BUF_COMPRESSED(buf));
 		arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
 		    ARC_HDR_USE_RESERVE);
 		abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
 	} else if (!(HDR_UNCACHED(hdr) ||
 	    abd_size_alloc_linear(arc_buf_size(buf))) ||
 	    !arc_can_share(hdr, buf)) {
 		/*
 		 * Ideally, we would always copy the io_abd into b_pabd, but the
 		 * user may have disabled compressed ARC, thus we must check the
 		 * hdr's compression setting rather than the io_bp's.
 		 */
 		if (BP_IS_ENCRYPTED(bp)) {
 			ASSERT3U(psize, >, 0);
 			arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA |
 			    ARC_HDR_USE_RESERVE);
 			abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
 		} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
 		    !ARC_BUF_COMPRESSED(buf)) {
 			ASSERT3U(psize, >, 0);
 			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
 			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
 		} else {
 			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
 			arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE);
 			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
 			    arc_buf_size(buf));
 		}
 	} else {
 		/* Share the buf's data with the hdr instead of copying it. */
 		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
 		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf);
 		ASSERT(ARC_BUF_LAST(buf));
 
 		arc_share_buf(hdr, buf);
 	}
 
 out:
 	arc_hdr_verify(hdr, bp);
 	spl_fstrans_unmark(cookie);
 }
 
 /*
  * Children-ready hook for ARC write zios: forwards to the caller-supplied
  * awcb_children_ready callback, passing the buffer being written and the
  * caller's private argument.
  */
 static void
 arc_write_children_ready(zio_t *zio)
 {
 	arc_write_callback_t *awcb = zio->io_private;
 
 	awcb->awcb_children_ready(zio, awcb->awcb_buf, awcb->awcb_private);
 }
 
 /*
  * Done callback for ARC write zios.  On success it gives the hdr the
  * identity (DVA and birth) of the block that was written, or discards the
  * identity for holes and embedded bps.  A hdr that ends up with an
  * identity is inserted into the hash table.  In all cases the
  * IO_IN_PROGRESS flag and its hold are dropped, the caller's awcb_done
  * callback is invoked, and the zio's io_abd and the callback record are
  * freed.
  */
 static void
 arc_write_done(zio_t *zio)
 {
 	arc_write_callback_t *callback = zio->io_private;
 	arc_buf_t *buf = callback->awcb_buf;
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 
 	if (zio->io_error == 0) {
 		arc_hdr_verify(hdr, zio->io_bp);
 
 		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
 			buf_discard_identity(hdr);
 		} else {
 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
 			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
 		}
 	} else {
 		ASSERT(HDR_EMPTY(hdr));
 	}
 
 	/*
 	 * If the block to be written was all-zero or compressed enough to be
 	 * embedded in the BP, no write was performed so there will be no
 	 * dva/birth/checksum.  The buffer must therefore remain anonymous
 	 * (and uncached).
 	 */
 	if (!HDR_EMPTY(hdr)) {
 		arc_buf_hdr_t *exists;
 		kmutex_t *hash_lock;
 
 		ASSERT3U(zio->io_error, ==, 0);
 
 		arc_cksum_verify(buf);
 
 		/* A non-NULL result is a hdr already hashed for this block. */
 		exists = buf_hash_insert(hdr, &hash_lock);
 		if (exists != NULL) {
 			/*
 			 * This can only happen if we overwrite for
 			 * sync-to-convergence, because we remove
 			 * buffers from the hash table when we arc_free().
 			 */
 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad overwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 				ASSERT(zfs_refcount_is_zero(
 				    &exists->b_l1hdr.b_refcnt));
 				/* Destroy the old hdr and insert ours instead. */
 				arc_change_state(arc_anon, exists);
 				arc_hdr_destroy(exists);
 				mutex_exit(hash_lock);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
 				/* nopwrite */
 				ASSERT(zio->io_prop.zp_nopwrite);
 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
 					panic("bad nopwrite, hdr=%p exists=%p",
 					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
 				ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 				ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf));
 				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
 				ASSERT(BP_GET_DEDUP(zio->io_bp));
 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
 			}
 		}
 		/* Drop the flag and hold taken in arc_write_ready(). */
 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		VERIFY3S(remove_reference(hdr, hdr), >, 0);
 		/* if it's not anon, we are doing a scrub */
 		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
 			arc_access(hdr, 0, B_FALSE);
 		mutex_exit(hash_lock);
 	} else {
 		/* No identity, so no hash insert; just drop the flag and hold. */
 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
 		VERIFY3S(remove_reference(hdr, hdr), >, 0);
 	}
 
 	callback->awcb_done(zio, buf, callback->awcb_private);
 
 	abd_free(zio->io_abd);
 	kmem_free(callback, sizeof (arc_write_callback_t));
 }
 
 /*
  * Build the write zio for the contents of ARC buffer `buf`.
  *
  * The caller's write properties (*zp) are copied into a local zio_prop_t
  * so they can be adjusted for encrypted or compressed buffers without
  * touching the caller's copy.  An arc_write_callback_t recording the
  * caller's callbacks, `private` and `buf` is allocated here and handed to
  * zio_write() as the private argument that accompanies arc_write_ready(),
  * arc_write_children_ready() and arc_write_done().
  *
  * `ready` and `done` must be non-NULL; `children_ready` is optional.
  * `uncached` takes precedence over `l2arc` when both are set.
  *
  * On return the hdr no longer owns a b_pabd (or raw abd); the data lives
  * solely in buf->b_data.  Returns the zio created by zio_write().
  */
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
     blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
     const zio_prop_t *zp, arc_write_done_func_t *ready,
     arc_write_done_func_t *children_ready, arc_write_done_func_t *done,
     void *private, zio_priority_t priority, int zio_flags,
     const zbookmark_phys_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
 	zio_t *zio;
 	zio_prop_t localprop = *zp;
 
 	ASSERT3P(ready, !=, NULL);
 	ASSERT3P(done, !=, NULL);
 	ASSERT(!HDR_IO_ERROR(hdr));
 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
 	ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL);
 	if (uncached)
 		arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED);
 	else if (l2arc)
 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
 
 	/*
 	 * Encrypted bufs are written raw, with the compression and crypto
 	 * parameters taken from the hdr rather than from the caller's zp.
 	 * Compressed-only bufs keep their compression settings and are
 	 * flagged ZIO_FLAG_RAW_COMPRESS instead.
 	 */
 	if (ARC_BUF_ENCRYPTED(buf)) {
 		ASSERT(ARC_BUF_COMPRESSED(buf));
 		localprop.zp_encrypt = B_TRUE;
 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
 		localprop.zp_complevel = hdr->b_complevel;
 		/*
 		 * b_byteswap == DMU_BSWAP_NUMFUNCS selects the host byte
 		 * order; any other value selects the opposite order.
 		 */
 		localprop.zp_byteorder =
 		    (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
 		    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
 		memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt,
 		    ZIO_DATA_SALT_LEN);
 		memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv,
 		    ZIO_DATA_IV_LEN);
 		memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac,
 		    ZIO_DATA_MAC_LEN);
 		/*
 		 * Encrypted object types never nopwrite and use at most
 		 * SPA_DVAS_PER_BP - 1 copies.
 		 * NOTE(review): presumably one DVA slot is reserved for
 		 * crypto parameters in encrypted block pointers -- confirm.
 		 */
 		if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
 			localprop.zp_nopwrite = B_FALSE;
 			localprop.zp_copies =
 			    MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
 		}
 		zio_flags |= ZIO_FLAG_RAW;
 	} else if (ARC_BUF_COMPRESSED(buf)) {
 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
 		localprop.zp_complevel = hdr->b_complevel;
 		zio_flags |= ZIO_FLAG_RAW_COMPRESS;
 	}
 	/*
 	 * Sleeping allocation of the per-write callback record.
 	 * NOTE(review): presumably released by arc_write_done() -- confirm.
 	 */
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
 	callback->awcb_children_ready = children_ready;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
 	/*
 	 * The hdr's b_pabd is now stale, free it now. A new data block
 	 * will be allocated when the zio pipeline calls arc_write_ready().
 	 */
 	if (hdr->b_l1hdr.b_pabd != NULL) {
 		/*
 		 * If the buf is currently sharing the data block with
 		 * the hdr then we need to break that relationship here.
 		 * The hdr will remain with a NULL data pointer and the
 		 * buf will take sole ownership of the block.
 		 */
 		if (ARC_BUF_SHARED(buf)) {
 			arc_unshare_buf(hdr, buf);
 		} else {
 			ASSERT(!arc_buf_is_shared(buf));
 			arc_hdr_free_abd(hdr, B_FALSE);
 		}
 		VERIFY3P(buf->b_data, !=, NULL);
 	}
 
 	/* Any raw abd held by the hdr is dropped as well. */
 	if (HDR_HAS_RABD(hdr))
 		arc_hdr_free_abd(hdr, B_TRUE);
 
 	/* Only raw (encrypted) writes keep the hdr's compression setting. */
 	if (!(zio_flags & ZIO_FLAG_RAW))
 		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
 
 	ASSERT(!arc_buf_is_shared(buf));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
 
 	/*
 	 * The children-ready hook is only installed when the caller
 	 * supplied one.
 	 */
 	zio = zio_write(pio, spa, txg, bp,
 	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
 	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
 	    (children_ready != NULL) ? arc_write_children_ready : NULL,
 	    arc_write_done, callback, priority, zio_flags, zb);
 
 	return (zio);
 }
 
 /*
  * Subtract `reserve` from the global arc_tempreserve counter; this undoes
  * the addition made at the end of a successful arc_tempreserve_space().
  */
 void
 arc_tempreserve_clear(uint64_t reserve)
 {
 	/* Unsigned wrap-around: adding (0 - reserve) subtracts reserve. */
 	const uint64_t delta = 0 - reserve;
 
 	atomic_add_64(&arc_tempreserve, delta);
 	ASSERT((int64_t)arc_tempreserve >= 0);
 }
 
 /*
  * Try to reserve `reserve` bytes of in-flight dirty data against the ARC
  * on behalf of transaction group `txg` of pool `spa`.
  *
  * Returns:
  *   0         the reservation was added to arc_tempreserve;
  *   ERESTART  `reserve` exceeds arc_c, or the dirty-data throttle tripped;
  *   other     whatever non-zero error arc_memory_throttle() reported.
  *
  * Side effect: a sufficiently large reservation may raise arc_c (capped at
  * arc_c_max) before any of the checks run.
  */
 int
 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
 {
 	int error;
 	int64_t anon_net;
 	uint64_t anon_size;
 
 	/*
 	 * Unless growth is disabled, let a large request pull the ARC
 	 * target up to four times its size.
 	 */
 	if (!arc_no_grow &&
 	    reserve > arc_c/4 &&
 	    reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
 		arc_c = MIN(arc_c_max, reserve * 4);
 
 	/*
 	 * Throttle when the calculated memory footprint for the TXG
 	 * exceeds the target ARC size.
 	 */
 	if (reserve > arc_c) {
 		DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
 		return (SET_ERROR(ERESTART));
 	}
 
 	/*
 	 * Don't count loaned bufs as in flight dirty data to prevent long
 	 * network delays from blocking transactions that are ready to be
 	 * assigned to a txg.
 	 */
 
 	/* assert that it has not wrapped around */
 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
 
 	/*
 	 * Sample the live counters exactly once before clamping.  MAX() is
 	 * a macro that may evaluate its argument twice; feeding it the
 	 * expression directly would let a concurrent update between the two
 	 * evaluations slip a negative value past the clamp, which would
 	 * then appear in the unsigned anon_size as a huge number.
 	 */
 	anon_net = (int64_t)
 	    (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) +
 	    zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) -
 	    arc_loaned_bytes);
 	anon_size = MAX(anon_net, 0);
 
 	/*
 	 * Writes will, almost always, require additional memory allocations
 	 * in order to compress/encrypt/etc the data.  We therefore need to
 	 * make sure that there is sufficient available memory for this.
 	 */
 	error = arc_memory_throttle(spa, reserve, txg);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Throttle writes when the amount of dirty data in the cache
 	 * gets too large.  We try to keep the cache less than half full
 	 * of dirty blocks so that our sync times don't grow too large.
 	 *
 	 * In the case of one pool being built on another pool, we want
 	 * to make sure we don't end up throttling the lower (backing)
 	 * pool when the upper pool is the majority contributor to dirty
 	 * data. To ensure we make forward progress during throttling, we
 	 * also check the current pool's net dirty data and only throttle
 	 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
 	 * data in the cache.
 	 *
 	 * Note: if two requests come in concurrently, we might let them
 	 * both succeed, when one of them should fail.  Not a huge deal.
 	 */
 	uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
 	uint64_t spa_dirty_anon = spa_dirty_data(spa);
 	/* Until the ARC is warm, measure against arc_c_max, not arc_c. */
 	uint64_t rarc_c = arc_warm ? arc_c : arc_c_max;
 	if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 &&
 	    anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 &&
 	    spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
 #ifdef ZFS_DEBUG
 		uint64_t meta_esize = zfs_refcount_count(
 		    &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 		uint64_t data_esize =
 		    zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
 		    "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n",
 		    (u_longlong_t)arc_tempreserve >> 10,
 		    (u_longlong_t)meta_esize >> 10,
 		    (u_longlong_t)data_esize >> 10,
 		    (u_longlong_t)reserve >> 10,
 		    (u_longlong_t)rarc_c >> 10);
 #endif
 		DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle);
 		return (SET_ERROR(ERESTART));
 	}
 	atomic_add_64(&arc_tempreserve, reserve);
 	return (0);
 }
 
 /*
  * Copy the size counters of one ARC state into its kstat entries.
  *
  * `data`/`metadata` receive the arcs_size counts, `size` their sum, and
  * `evict_data`/`evict_metadata` the arcs_esize counts.
  */
 static void
 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
     kstat_named_t *data, kstat_named_t *metadata,
     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
 {
 	uint64_t data_cnt, meta_cnt, data_ecnt, meta_ecnt;
 
 	/* Read every counter once, in a fixed order. */
 	data_cnt = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]);
 	meta_cnt = zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]);
 	data_ecnt = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
 	meta_ecnt = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
 
 	data->value.ui64 = data_cnt;
 	metadata->value.ui64 = meta_cnt;
 	size->value.ui64 = data_cnt + meta_cnt;
 	evict_data->value.ui64 = data_ecnt;
 	evict_metadata->value.ui64 = meta_ecnt;
 }
 
 /*
  * Helpers local to arc_kstat_update(): copy one arc_sums counter into the
  * identically named field of the arc_stats_t pointed to by `as`.
  */
 #define	ARC_KSTAT_WMSUM(field)						\
 	(as->field.value.ui64 = wmsum_value(&arc_sums.field))
 #define	ARC_KSTAT_AGGSUM(field)						\
 	(as->field.value.ui64 = aggsum_value(&arc_sums.field))
 
 /*
  * Refresh the arc_stats_t behind ksp->ks_data from the live ARC counters.
  *
  * Writes are refused with EACCES; a read always returns 0.
  */
 static int
 arc_kstat_update(kstat_t *ksp, int rw)
 {
 	arc_stats_t *as = ksp->ks_data;
 
 	if (rw == KSTAT_WRITE)
 		return (SET_ERROR(EACCES));
 
 	ARC_KSTAT_WMSUM(arcstat_hits);
 	ARC_KSTAT_WMSUM(arcstat_iohits);
 	ARC_KSTAT_WMSUM(arcstat_misses);
 	ARC_KSTAT_WMSUM(arcstat_demand_data_hits);
 	ARC_KSTAT_WMSUM(arcstat_demand_data_iohits);
 	ARC_KSTAT_WMSUM(arcstat_demand_data_misses);
 	ARC_KSTAT_WMSUM(arcstat_demand_metadata_hits);
 	ARC_KSTAT_WMSUM(arcstat_demand_metadata_iohits);
 	ARC_KSTAT_WMSUM(arcstat_demand_metadata_misses);
 	ARC_KSTAT_WMSUM(arcstat_prefetch_data_hits);
 	ARC_KSTAT_WMSUM(arcstat_prefetch_data_iohits);
 	ARC_KSTAT_WMSUM(arcstat_prefetch_data_misses);
 	ARC_KSTAT_WMSUM(arcstat_prefetch_metadata_hits);
 	ARC_KSTAT_WMSUM(arcstat_prefetch_metadata_iohits);
 	ARC_KSTAT_WMSUM(arcstat_prefetch_metadata_misses);
 	ARC_KSTAT_WMSUM(arcstat_mru_hits);
 	ARC_KSTAT_WMSUM(arcstat_mru_ghost_hits);
 	ARC_KSTAT_WMSUM(arcstat_mfu_hits);
 	ARC_KSTAT_WMSUM(arcstat_mfu_ghost_hits);
 	ARC_KSTAT_WMSUM(arcstat_uncached_hits);
 	ARC_KSTAT_WMSUM(arcstat_deleted);
 	ARC_KSTAT_WMSUM(arcstat_mutex_miss);
 	ARC_KSTAT_WMSUM(arcstat_access_skip);
 	ARC_KSTAT_WMSUM(arcstat_evict_skip);
 	ARC_KSTAT_WMSUM(arcstat_evict_not_enough);
 	ARC_KSTAT_WMSUM(arcstat_evict_l2_cached);
 	ARC_KSTAT_WMSUM(arcstat_evict_l2_eligible);
 	ARC_KSTAT_WMSUM(arcstat_evict_l2_eligible_mfu);
 	ARC_KSTAT_WMSUM(arcstat_evict_l2_eligible_mru);
 	ARC_KSTAT_WMSUM(arcstat_evict_l2_ineligible);
 	ARC_KSTAT_WMSUM(arcstat_evict_l2_skip);
 	ARC_KSTAT_WMSUM(arcstat_hash_collisions);
 	ARC_KSTAT_WMSUM(arcstat_hash_chains);
 	ARC_KSTAT_AGGSUM(arcstat_size);
 	ARC_KSTAT_WMSUM(arcstat_compressed_size);
 	ARC_KSTAT_WMSUM(arcstat_uncompressed_size);
 	ARC_KSTAT_WMSUM(arcstat_overhead_size);
 	ARC_KSTAT_WMSUM(arcstat_hdr_size);
 	ARC_KSTAT_WMSUM(arcstat_data_size);
 	ARC_KSTAT_WMSUM(arcstat_metadata_size);
 	ARC_KSTAT_WMSUM(arcstat_dbuf_size);
 #if defined(COMPAT_FREEBSD11)
 	/* Compat-only aggregate: bonus + dnode + dbuf sizes. */
 	as->arcstat_other_size.value.ui64 =
 	    wmsum_value(&arc_sums.arcstat_bonus_size) +
 	    wmsum_value(&arc_sums.arcstat_dnode_size) +
 	    wmsum_value(&arc_sums.arcstat_dbuf_size);
 #endif
 
 	/* Per-state size and evictable-size counters. */
 	arc_kstat_update_state(arc_anon,
 	    &as->arcstat_anon_size,
 	    &as->arcstat_anon_data,
 	    &as->arcstat_anon_metadata,
 	    &as->arcstat_anon_evictable_data,
 	    &as->arcstat_anon_evictable_metadata);
 	arc_kstat_update_state(arc_mru,
 	    &as->arcstat_mru_size,
 	    &as->arcstat_mru_data,
 	    &as->arcstat_mru_metadata,
 	    &as->arcstat_mru_evictable_data,
 	    &as->arcstat_mru_evictable_metadata);
 	arc_kstat_update_state(arc_mru_ghost,
 	    &as->arcstat_mru_ghost_size,
 	    &as->arcstat_mru_ghost_data,
 	    &as->arcstat_mru_ghost_metadata,
 	    &as->arcstat_mru_ghost_evictable_data,
 	    &as->arcstat_mru_ghost_evictable_metadata);
 	arc_kstat_update_state(arc_mfu,
 	    &as->arcstat_mfu_size,
 	    &as->arcstat_mfu_data,
 	    &as->arcstat_mfu_metadata,
 	    &as->arcstat_mfu_evictable_data,
 	    &as->arcstat_mfu_evictable_metadata);
 	arc_kstat_update_state(arc_mfu_ghost,
 	    &as->arcstat_mfu_ghost_size,
 	    &as->arcstat_mfu_ghost_data,
 	    &as->arcstat_mfu_ghost_metadata,
 	    &as->arcstat_mfu_ghost_evictable_data,
 	    &as->arcstat_mfu_ghost_evictable_metadata);
 	arc_kstat_update_state(arc_uncached,
 	    &as->arcstat_uncached_size,
 	    &as->arcstat_uncached_data,
 	    &as->arcstat_uncached_metadata,
 	    &as->arcstat_uncached_evictable_data,
 	    &as->arcstat_uncached_evictable_metadata);
 
 	ARC_KSTAT_WMSUM(arcstat_dnode_size);
 	ARC_KSTAT_WMSUM(arcstat_bonus_size);
 	ARC_KSTAT_WMSUM(arcstat_l2_hits);
 	ARC_KSTAT_WMSUM(arcstat_l2_misses);
 	ARC_KSTAT_WMSUM(arcstat_l2_prefetch_asize);
 	ARC_KSTAT_WMSUM(arcstat_l2_mru_asize);
 	ARC_KSTAT_WMSUM(arcstat_l2_mfu_asize);
 	ARC_KSTAT_WMSUM(arcstat_l2_bufc_data_asize);
 	ARC_KSTAT_WMSUM(arcstat_l2_bufc_metadata_asize);
 	ARC_KSTAT_WMSUM(arcstat_l2_feeds);
 	ARC_KSTAT_WMSUM(arcstat_l2_rw_clash);
 	ARC_KSTAT_WMSUM(arcstat_l2_read_bytes);
 	ARC_KSTAT_WMSUM(arcstat_l2_write_bytes);
 	ARC_KSTAT_WMSUM(arcstat_l2_writes_sent);
 	ARC_KSTAT_WMSUM(arcstat_l2_writes_done);
 	ARC_KSTAT_WMSUM(arcstat_l2_writes_error);
 	ARC_KSTAT_WMSUM(arcstat_l2_writes_lock_retry);
 	ARC_KSTAT_WMSUM(arcstat_l2_evict_lock_retry);
 	ARC_KSTAT_WMSUM(arcstat_l2_evict_reading);
 	ARC_KSTAT_WMSUM(arcstat_l2_evict_l1cached);
 	ARC_KSTAT_WMSUM(arcstat_l2_free_on_write);
 	ARC_KSTAT_WMSUM(arcstat_l2_abort_lowmem);
 	ARC_KSTAT_WMSUM(arcstat_l2_cksum_bad);
 	ARC_KSTAT_WMSUM(arcstat_l2_io_error);
 	ARC_KSTAT_WMSUM(arcstat_l2_lsize);
 	ARC_KSTAT_WMSUM(arcstat_l2_psize);
 	ARC_KSTAT_AGGSUM(arcstat_l2_hdr_size);
 	ARC_KSTAT_WMSUM(arcstat_l2_log_blk_writes);
 	ARC_KSTAT_WMSUM(arcstat_l2_log_blk_asize);
 	ARC_KSTAT_WMSUM(arcstat_l2_log_blk_count);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_success);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_abort_unsupported);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_abort_io_errors);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_abort_dh_errors);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_abort_cksum_lb_errors);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_abort_lowmem);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_size);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_asize);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_bufs);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_bufs_precached);
 	ARC_KSTAT_WMSUM(arcstat_l2_rebuild_log_blks);
 	ARC_KSTAT_WMSUM(arcstat_memory_throttle_count);
 	ARC_KSTAT_WMSUM(arcstat_memory_direct_count);
 	ARC_KSTAT_WMSUM(arcstat_memory_indirect_count);
 
 	/* Memory figures come from helper functions, not from arc_sums. */
 	as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory();
 	as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory();
 	/* Stored through the signed (i64) member. */
 	as->arcstat_memory_available_bytes.value.i64 = arc_available_memory();
 
 	ARC_KSTAT_WMSUM(arcstat_prune);
 	ARC_KSTAT_WMSUM(arcstat_meta_used);
 	ARC_KSTAT_WMSUM(arcstat_async_upgrade_sync);
 	ARC_KSTAT_WMSUM(arcstat_predictive_prefetch);
 	ARC_KSTAT_WMSUM(arcstat_demand_hit_predictive_prefetch);
 	ARC_KSTAT_WMSUM(arcstat_demand_iohit_predictive_prefetch);
 	ARC_KSTAT_WMSUM(arcstat_prescient_prefetch);
 	ARC_KSTAT_WMSUM(arcstat_demand_hit_prescient_prefetch);
 	ARC_KSTAT_WMSUM(arcstat_demand_iohit_prescient_prefetch);
 	ARC_KSTAT_WMSUM(arcstat_raw_size);
 	ARC_KSTAT_WMSUM(arcstat_cached_only_in_progress);
 	ARC_KSTAT_WMSUM(arcstat_abd_chunk_waste_size);
 
 	return (0);
 }
 
 #undef	ARC_KSTAT_WMSUM
 #undef	ARC_KSTAT_AGGSUM
 
 /*
  * Sublist selector for the ARC state multilists.
  *
  * The indices handed back here *must* be spread evenly over all sublists
  * of the multilist: arc_evict_state() assumes ARC buffers are evenly
  * distributed when it decides which sublist to evict from and how much
  * to evict from it.
  */
 static unsigned int
 arc_state_multilist_index_func(multilist_t *ml, void *obj)
 {
 	arc_buf_hdr_t *hdr = obj;
 	unsigned int hash32;
 
 	/*
 	 * The even spread comes from running buf_hash() over b_dva, so as
 	 * an added precaution make sure an empty header is never added to
 	 * the arc lists.
 	 */
 	ASSERT(!HDR_EMPTY(hdr));
 
 	/*
 	 * A header's b_spa, b_dva and b_birth are assumed not to change
 	 * during its lifetime, so its hash is constant and the sublist
 	 * index can be recomputed on removal instead of being stored at
 	 * insertion.
 	 *
 	 * The low order bits of the hash are thought to be evenly
 	 * distributed (otherwise a power-of-two number of sublists would
 	 * be used unevenly).  Full 64-bit division would be a waste of
 	 * time here, so the hash is truncated to 32 bits first.
 	 */
 	hash32 = (unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva,
 	    hdr->b_birth);
 
 	return (hash32 % multilist_get_num_sublists(ml));
 }
 
 /*
  * Index function installed on the arc_l2c_only multilists by
  * arc_state_init().  It never computes an index: any call panics,
  * reporting the offending header and multilist, so an attempt to place a
  * header on those lists is fatal.
  *
  * There is deliberately no return statement; panic() is expected not to
  * return.
  */
 static unsigned int
 arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj)
 {
 	panic("Header %p insert into arc_l2c_only %p", obj, ml);
 }
 
 /*
  * Warn (CE_WARN) when a requested tunable was not honoured.
  *
  *   tuning   the tunable as requested; zero means "not set" and never
  *            warns.  It is stringified into the message, so pass the
  *            tunable variable itself rather than an expression.
  *   value    the setting actually in effect, printed as %llu.
  *   do_warn  the warning is only emitted when this is non-zero.
  *
  * `tuning` and `value` are each evaluated more than once.
  */
 #define	WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do {	\
 	if ((do_warn) && (tuning) && ((tuning) != (value))) {	\
 		cmn_err(CE_WARN,				\
 		    "ignoring tunable %s (using %llu instead)",	\
 		    (#tuning), (u_longlong_t)(value));	\
 	}							\
 } while (0)
 
 /*
  * Called during module initialization and periodically thereafter to
  * apply reasonable changes to the exposed performance tunings.  Can also be
  * called explicitly by param_set_arc_*() functions when ARC tunables are
  * updated manually.  Non-zero zfs_* values which differ from the currently set
  * values will be applied.
  *
  * When `verbose` is set, a warning is logged for each checked tunable that
  * is non-zero but differs from the value left in effect.
  *
  * The steps below are order dependent: the arc_c_min chosen first bounds
  * the zfs_arc_max check, and the resulting arc_c_max feeds the dnode limit.
  */
 void
 arc_tuning_update(boolean_t verbose)
 {
 	uint64_t allmem = arc_all_memory();
 
 	/* Valid range: 32M - <arc_c_max> */
 	if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
 	    (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
 	    (zfs_arc_min <= arc_c_max)) {
 		arc_c_min = zfs_arc_min;
 		/* Keep the current target at or above the new minimum. */
 		arc_c = MAX(arc_c, arc_c_min);
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
 
 	/* Valid range: 64M - <all physical memory> */
 	if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
 	    (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) &&
 	    (zfs_arc_max > arc_c_min)) {
 		arc_c_max = zfs_arc_max;
 		/* Keep the current target at or below the new maximum. */
 		arc_c = MIN(arc_c, arc_c_max);
 		/*
 		 * NOTE(review): this clamp is immediately superseded by the
 		 * unconditional arc_dnode_limit assignment below.
 		 */
 		if (arc_dnode_limit > arc_c_max)
 			arc_dnode_limit = arc_c_max;
 	}
 	WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
 
 	/* Valid range: 0 - <all physical memory> */
 	/*
 	 * An explicit zfs_arc_dnode_limit wins; otherwise the limit is
 	 * zfs_arc_dnode_limit_percent (capped at 100) of arc_c_max.
 	 */
 	arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
 	    MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100;
 	WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose);
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_grow_retry)
 		arc_grow_retry = zfs_arc_grow_retry;
 
 	/* Valid range: 1 - N */
 	if (zfs_arc_shrink_shift) {
 		arc_shrink_shift = zfs_arc_shrink_shift;
 		/* arc_no_grow_shift is kept below arc_shrink_shift. */
 		arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1);
 	}
 
 	/* Valid range: 1 - N ms */
 	if (zfs_arc_min_prefetch_ms)
 		arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
 
 	/* Valid range: 1 - N ms */
 	if (zfs_arc_min_prescient_prefetch_ms) {
 		arc_min_prescient_prefetch_ms =
 		    zfs_arc_min_prescient_prefetch_ms;
 	}
 
 	/* Valid range: 0 - 100 */
 	/* Unlike the tunables above, zero is itself applied here. */
 	if (zfs_arc_lotsfree_percent <= 100)
 		arc_lotsfree_percent = zfs_arc_lotsfree_percent;
 	WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
 	    verbose);
 
 	/* Valid range: 0 - <all physical memory> */
 	/* Requests above allmem are clamped to allmem, not rejected. */
 	if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
 		arc_sys_free = MIN(zfs_arc_sys_free, allmem);
 	WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
 }
 
 /*
  * Create one ARC state multilist of arc_buf_hdr_t entries (linked through
  * b_l1hdr.b_arc_node) using `index_func`, and raise *maxcountp to the
  * multilist's sublist count when that count is larger.
  */
 static void
 arc_state_multilist_init(multilist_t *ml,
     multilist_sublist_index_func_t *index_func, int *maxcountp)
 {
 	unsigned int sublists;
 
 	multilist_create(ml, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func);
 
 	/* Running maximum across every list initialized so far. */
 	sublists = multilist_get_num_sublists(ml);
 	if (sublists > (unsigned int)*maxcountp)
 		*maxcountp = sublists;
 }
 
 static void
 arc_state_init(void)
 {
 	int num_sublists = 0;
 
 	arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA],
 	    arc_state_multilist_index_func, &num_sublists);
 
 	/*
 	 * L2 headers should never be on the L2 state list since they don't
 	 * have L1 headers allocated.  Special index function asserts that.
 	 */
 	arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
 	    arc_state_l2c_multilist_index_func, &num_sublists);
 	arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
 	    arc_state_l2c_multilist_index_func, &num_sublists);
 
 	/*
 	 * Keep track of the number of markers needed to reclaim buffers from
 	 * any ARC state.  The markers will be pre-allocated so as to minimize
 	 * the number of memory allocations performed by the eviction thread.
 	 */
 	arc_state_evict_marker_count = num_sublists;
 
 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
 
 	zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
 
 	wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0);
 	wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
 	wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0);
 	wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0);
 
 	wmsum_init(&arc_sums.arcstat_hits, 0);
 	wmsum_init(&arc_sums.arcstat_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_misses, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_hits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_data_misses, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0);
 	wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0);
 	wmsum_init(&arc_sums.arcstat_mru_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mfu_hits, 0);
 	wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0);
 	wmsum_init(&arc_sums.arcstat_uncached_hits, 0);
 	wmsum_init(&arc_sums.arcstat_deleted, 0);
 	wmsum_init(&arc_sums.arcstat_mutex_miss, 0);
 	wmsum_init(&arc_sums.arcstat_access_skip, 0);
 	wmsum_init(&arc_sums.arcstat_evict_skip, 0);
 	wmsum_init(&arc_sums.arcstat_evict_not_enough, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0);
 	wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0);
 	wmsum_init(&arc_sums.arcstat_hash_collisions, 0);
 	wmsum_init(&arc_sums.arcstat_hash_chains, 0);
 	aggsum_init(&arc_sums.arcstat_size, 0);
 	wmsum_init(&arc_sums.arcstat_compressed_size, 0);
 	wmsum_init(&arc_sums.arcstat_uncompressed_size, 0);
 	wmsum_init(&arc_sums.arcstat_overhead_size, 0);
 	wmsum_init(&arc_sums.arcstat_hdr_size, 0);
 	wmsum_init(&arc_sums.arcstat_data_size, 0);
 	wmsum_init(&arc_sums.arcstat_metadata_size, 0);
 	wmsum_init(&arc_sums.arcstat_dbuf_size, 0);
 	wmsum_init(&arc_sums.arcstat_dnode_size, 0);
 	wmsum_init(&arc_sums.arcstat_bonus_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_hits, 0);
 	wmsum_init(&arc_sums.arcstat_l2_misses, 0);
 	wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_feeds, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0);
 	wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_done, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_error, 0);
 	wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0);
 	wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0);
 	wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0);
 	wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0);
 	wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0);
 	wmsum_init(&arc_sums.arcstat_l2_io_error, 0);
 	wmsum_init(&arc_sums.arcstat_l2_lsize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_psize, 0);
 	aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0);
 	wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0);
 	wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0);
 	wmsum_init(&arc_sums.arcstat_memory_direct_count, 0);
 	wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0);
 	wmsum_init(&arc_sums.arcstat_prune, 0);
 	wmsum_init(&arc_sums.arcstat_meta_used, 0);
 	wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0);
 	wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0);
 	wmsum_init(&arc_sums.arcstat_raw_size, 0);
 	wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0);
 	wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0);
 
 	arc_anon->arcs_state = ARC_STATE_ANON;
 	arc_mru->arcs_state = ARC_STATE_MRU;
 	arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST;
 	arc_mfu->arcs_state = ARC_STATE_MFU;
 	arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST;
 	arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY;
 	arc_uncached->arcs_state = ARC_STATE_UNCACHED;
 }
 
 /*
  * Destroy the per-state bookkeeping (arcs_esize/arcs_size refcounts,
  * arcs_list multilists, ghost-state arcs_hits counters) and then every
  * counter held in arc_sums.
  *
  * NOTE(review): presumably the exact mirror of arc_state_init(); any
  * counter added there needs a matching *_fini() call here.
  */
 static void
 arc_state_fini(void)
 {
 	/* arcs_esize refcounts, metadata and data, for all seven states. */
 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]);
 
 	/* arcs_size refcounts, data and metadata, for all seven states. */
 	zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]);
 	zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]);
 
 	/*
 	 * Multilists for every state except arc_anon, which has none
 	 * destroyed here.
 	 */
 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
 	multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]);
 	multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]);
 
 	/* Only the two ghost states have arcs_hits counters torn down. */
 	wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]);
 	wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]);
 	wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]);
 	wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]);
 
 	/*
 	 * Global arc_sums counters.  All are wmsums except arcstat_size and
 	 * arcstat_l2_hdr_size, which are aggsums.
 	 */
 	wmsum_fini(&arc_sums.arcstat_hits);
 	wmsum_fini(&arc_sums.arcstat_iohits);
 	wmsum_fini(&arc_sums.arcstat_misses);
 	wmsum_fini(&arc_sums.arcstat_demand_data_hits);
 	wmsum_fini(&arc_sums.arcstat_demand_data_iohits);
 	wmsum_fini(&arc_sums.arcstat_demand_data_misses);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_hits);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits);
 	wmsum_fini(&arc_sums.arcstat_demand_metadata_misses);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_hits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_data_misses);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits);
 	wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses);
 	wmsum_fini(&arc_sums.arcstat_mru_hits);
 	wmsum_fini(&arc_sums.arcstat_mru_ghost_hits);
 	wmsum_fini(&arc_sums.arcstat_mfu_hits);
 	wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits);
 	wmsum_fini(&arc_sums.arcstat_uncached_hits);
 	wmsum_fini(&arc_sums.arcstat_deleted);
 	wmsum_fini(&arc_sums.arcstat_mutex_miss);
 	wmsum_fini(&arc_sums.arcstat_access_skip);
 	wmsum_fini(&arc_sums.arcstat_evict_skip);
 	wmsum_fini(&arc_sums.arcstat_evict_not_enough);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_cached);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible);
 	wmsum_fini(&arc_sums.arcstat_evict_l2_skip);
 	wmsum_fini(&arc_sums.arcstat_hash_collisions);
 	wmsum_fini(&arc_sums.arcstat_hash_chains);
 	aggsum_fini(&arc_sums.arcstat_size);
 	wmsum_fini(&arc_sums.arcstat_compressed_size);
 	wmsum_fini(&arc_sums.arcstat_uncompressed_size);
 	wmsum_fini(&arc_sums.arcstat_overhead_size);
 	wmsum_fini(&arc_sums.arcstat_hdr_size);
 	wmsum_fini(&arc_sums.arcstat_data_size);
 	wmsum_fini(&arc_sums.arcstat_metadata_size);
 	wmsum_fini(&arc_sums.arcstat_dbuf_size);
 	wmsum_fini(&arc_sums.arcstat_dnode_size);
 	wmsum_fini(&arc_sums.arcstat_bonus_size);
 	wmsum_fini(&arc_sums.arcstat_l2_hits);
 	wmsum_fini(&arc_sums.arcstat_l2_misses);
 	wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_mru_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_mfu_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_feeds);
 	wmsum_fini(&arc_sums.arcstat_l2_rw_clash);
 	wmsum_fini(&arc_sums.arcstat_l2_read_bytes);
 	wmsum_fini(&arc_sums.arcstat_l2_write_bytes);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_sent);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_done);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_error);
 	wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_reading);
 	wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached);
 	wmsum_fini(&arc_sums.arcstat_l2_free_on_write);
 	wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem);
 	wmsum_fini(&arc_sums.arcstat_l2_cksum_bad);
 	wmsum_fini(&arc_sums.arcstat_l2_io_error);
 	wmsum_fini(&arc_sums.arcstat_l2_lsize);
 	wmsum_fini(&arc_sums.arcstat_l2_psize);
 	aggsum_fini(&arc_sums.arcstat_l2_hdr_size);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_log_blk_count);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_success);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_size);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached);
 	wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks);
 	wmsum_fini(&arc_sums.arcstat_memory_throttle_count);
 	wmsum_fini(&arc_sums.arcstat_memory_direct_count);
 	wmsum_fini(&arc_sums.arcstat_memory_indirect_count);
 	wmsum_fini(&arc_sums.arcstat_prune);
 	wmsum_fini(&arc_sums.arcstat_meta_used);
 	wmsum_fini(&arc_sums.arcstat_async_upgrade_sync);
 	wmsum_fini(&arc_sums.arcstat_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch);
 	wmsum_fini(&arc_sums.arcstat_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch);
 	wmsum_fini(&arc_sums.arcstat_raw_size);
 	wmsum_fini(&arc_sums.arcstat_cached_only_in_progress);
 	wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size);
 }
 
 /*
  * Report the ARC's current target size, i.e. the value of arc_c.
  */
 uint64_t
 arc_target_bytes(void)
 {
 	const uint64_t target = arc_c;
 
 	return (target);
 }
 
 /*
  * Derive the default ARC size bounds from the given amount of memory.
  *
  * arc_c_min becomes the larger of allmem / 32 and 2ULL << SPA_MAXBLOCKSHIFT;
  * arc_c_max is whatever arc_default_max() returns for that minimum and
  * memory size.
  */
 void
 arc_set_limits(uint64_t allmem)
 {
 	const uint64_t floor_bytes = 2ULL << SPA_MAXBLOCKSHIFT;
 	uint64_t min_bytes = allmem / 32;
 
 	/* Never let the minimum drop below the fixed floor. */
 	if (min_bytes < floor_bytes)
 		min_bytes = floor_bytes;
 	arc_c_min = min_bytes;
 
 	/* The default maximum is chosen per platform. */
 	arc_c_max = arc_default_max(arc_c_min, allmem);
 }
 /*
  * One-time ARC setup.
  *
  * In order: creates the eviction lock and waiter list, sets prefetch
  * lifetimes, derives arc_c_min/arc_c_max from total memory (honoring a
  * preset zfs_arc_max in the kernel), seeds arc_c and the metadata/MRU
  * fractions, applies tunables, then calls arc_state_init() and buf_init(),
  * creates the prune list/taskq, the "arcstats" kstat and the evict/reap
  * zthrs, and finally fills in zfs_dirty_data_max_max, zfs_dirty_data_max
  * and zfs_wrlog_data_max when they are still zero.
  */
 void
 arc_init(void)
 {
 	uint64_t percent, allmem = arc_all_memory();
 	mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
 	    offsetof(arc_evict_waiter_t, aew_node));
 
 	arc_min_prefetch_ms = 1000;
 	arc_min_prescient_prefetch_ms = 6000;
 
 #if defined(_KERNEL)
 	arc_lowmem_init();
 #endif
 
 	arc_set_limits(allmem);
 
 #ifdef _KERNEL
 	/*
 	 * If zfs_arc_max is non-zero at init, meaning it was set in the kernel
 	 * environment before the module was loaded, don't block setting the
 	 * maximum because it is less than arc_c_min, instead, reset arc_c_min
 	 * to a lower value.
 	 * zfs_arc_min will be handled by arc_tuning_update().
 	 */
 	if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX &&
 	    zfs_arc_max < allmem) {
 		arc_c_max = zfs_arc_max;
 		if (arc_c_min >= arc_c_max) {
 			arc_c_min = MAX(zfs_arc_max / 2,
 			    2ULL << SPA_MAXBLOCKSHIFT);
 		}
 	}
 #else
 	/*
 	 * In userland, there's only the memory pressure that we artificially
 	 * create (see arc_available_memory()).  Don't let arc_c get too
 	 * small, because it can cause transactions to be larger than
 	 * arc_c, causing arc_tempreserve_space() to fail.
 	 */
 	arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
 #endif
 
 	/* Start the target size at the minimum. */
 	arc_c = arc_c_min;
 	/*
 	 * 32-bit fixed point fractions of metadata from total ARC size,
 	 * MRU data from all data and MRU metadata from all metadata.
 	 */
 	arc_meta = (1ULL << 32) / 4;	/* Metadata is 25% of arc_c. */
 	arc_pd = (1ULL << 32) / 2;	/* Data MRU is 50% of data. */
 	arc_pm = (1ULL << 32) / 2;	/* Metadata MRU is 50% of metadata. */
 
 	/* The dnode limit is a percentage of arc_c_max, clamped to 100%. */
 	percent = MIN(zfs_arc_dnode_limit_percent, 100);
 	arc_dnode_limit = arc_c_max * percent / 100;
 
 	/* Apply user specified tunings */
 	arc_tuning_update(B_TRUE);
 
 	/* if kmem_flags are set, lets try to use less memory */
 	if (kmem_debugging())
 		arc_c = arc_c / 2;
 	/* Halving above must not push the target below the minimum. */
 	if (arc_c < arc_c_min)
 		arc_c = arc_c_min;
 
 	arc_register_hotplug();
 
 	arc_state_init();
 
 	buf_init();
 
 	list_create(&arc_prune_list, sizeof (arc_prune_t),
 	    offsetof(arc_prune_t, p_node));
 	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads,
 	    defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 
 	/* kstat creation may fail; the ARC simply runs without arcstats. */
 	if (arc_ksp != NULL) {
 		arc_ksp->ks_data = &arc_stats;
 		arc_ksp->ks_update = arc_kstat_update;
 		kstat_install(arc_ksp);
 	}
 
 	arc_state_evict_markers =
 	    arc_state_alloc_markers(arc_state_evict_marker_count);
 	arc_evict_zthr = zthr_create_timer("arc_evict",
 	    arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri);
 	arc_reap_zthr = zthr_create_timer("arc_reap",
 	    arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri);
 
 	arc_warm = B_FALSE;
 
 	/*
 	 * Calculate maximum amount of dirty data per pool.
 	 *
 	 * If it has been set by a module parameter, take that.
 	 * Otherwise, use a percentage of physical memory defined by
 	 * zfs_dirty_data_max_percent (default 10%) with a cap at
 	 * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
 	 */
 #ifdef __LP64__
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
 		    allmem * zfs_dirty_data_max_max_percent / 100);
 #else
 	/* Without __LP64__ the fixed cap is 1G instead of 4G. */
 	if (zfs_dirty_data_max_max == 0)
 		zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
 		    allmem * zfs_dirty_data_max_max_percent / 100);
 #endif
 
 	if (zfs_dirty_data_max == 0) {
 		zfs_dirty_data_max = allmem *
 		    zfs_dirty_data_max_percent / 100;
 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
 		    zfs_dirty_data_max_max);
 	}
 
 	if (zfs_wrlog_data_max == 0) {
 
 		/*
 		 * dp_wrlog_total is reduced for each txg at the end of
 		 * spa_sync(). However, dp_dirty_total is reduced every time
 		 * a block is written out. Thus under normal operation,
 		 * dp_wrlog_total could grow 2 times as big as
 		 * zfs_dirty_data_max.
 		 */
 		zfs_wrlog_data_max = zfs_dirty_data_max * 2;
 	}
 }
 
 /*
  * Tear down the ARC.
  *
  * Flushes all buffers, removes the kstat, drains and destroys the prune
  * taskq and list, cancels the evict/reap zthrs, frees pending
  * free-on-write buffers, then runs buf_fini() and arc_state_fini() in that
  * order, and only afterwards destroys the zthrs.  The ordering constraints
  * are explained at each step below.
  */
 void
 arc_fini(void)
 {
 	arc_prune_t *p;
 
 #ifdef _KERNEL
 	arc_lowmem_fini();
 #endif /* _KERNEL */
 
 	/* Use B_TRUE to ensure *all* buffers are evicted */
 	arc_flush(NULL, B_TRUE);
 
 	if (arc_ksp != NULL) {
 		kstat_delete(arc_ksp);
 		arc_ksp = NULL;
 	}
 
 	/* Let queued prune tasks finish before the taskq goes away. */
 	taskq_wait(arc_prune_taskq);
 	taskq_destroy(arc_prune_taskq);
 
 	/* Drop the list's reference on each prune entry and free it. */
 	mutex_enter(&arc_prune_mtx);
 	while ((p = list_remove_head(&arc_prune_list)) != NULL) {
 		zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
 		zfs_refcount_destroy(&p->p_refcnt);
 		kmem_free(p, sizeof (*p));
 	}
 	mutex_exit(&arc_prune_mtx);
 
 	list_destroy(&arc_prune_list);
 	mutex_destroy(&arc_prune_mtx);
 
 	/* Cancel now; the zthrs themselves are destroyed at the very end. */
 	(void) zthr_cancel(arc_evict_zthr);
 	(void) zthr_cancel(arc_reap_zthr);
 	arc_state_free_markers(arc_state_evict_markers,
 	    arc_state_evict_marker_count);
 
 	mutex_destroy(&arc_evict_lock);
 	list_destroy(&arc_evict_waiters);
 
 	/*
 	 * Free any buffers that were tagged for destruction.  This needs
 	 * to occur before arc_state_fini() runs and destroys the aggsum
 	 * values which are updated when freeing scatter ABDs.
 	 */
 	l2arc_do_free_on_write();
 
 	/*
 	 * buf_fini() must precede arc_state_fini() because buf_fini() may
 	 * trigger the release of kmem magazines, which can callback to
 	 * arc_space_return() which accesses aggsums freed in arc_state_fini().
 	 */
 	buf_fini();
 	arc_state_fini();
 
 	arc_unregister_hotplug();
 
 	/*
 	 * We destroy the zthrs after all the ARC state has been
 	 * torn down to avoid the case of them receiving any
 	 * wakeup() signals after they are destroyed.
 	 */
 	zthr_destroy(arc_evict_zthr);
 	zthr_destroy(arc_reap_zthr);
 
 	ASSERT0(arc_loaned_bytes);
 }
 
 /*
  * Level 2 ARC
  *
  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
  * It uses dedicated storage devices to hold cached data, which are populated
  * using large infrequent writes.  The main role of this cache is to boost
  * the performance of random read workloads.  The intended L2ARC devices
  * include short-stroked disks, solid state disks, and other media with
  * substantially faster read latency than disk.
  *
  *                 +-----------------------+
  *                 |         ARC           |
  *                 +-----------------------+
  *                    |         ^     ^
  *                    |         |     |
  *      l2arc_feed_thread()    arc_read()
  *                    |         |     |
  *                    |  l2arc read   |
  *                    V         |     |
  *               +---------------+    |
  *               |     L2ARC     |    |
  *               +---------------+    |
  *                   |    ^           |
  *          l2arc_write() |           |
  *                   |    |           |
  *                   V    |           |
  *                 +-------+      +-------+
  *                 | vdev  |      | vdev  |
  *                 | cache |      | cache |
  *                 +-------+      +-------+
  *                 +=========+     .-----.
  *                 :  L2ARC  :    |-_____-|
  *                 : devices :    | Disks |
  *                 +=========+    `-_____-'
  *
  * Read requests are satisfied from the following sources, in order:
  *
  *	1) ARC
  *	2) vdev cache of L2ARC devices
  *	3) L2ARC devices
  *	4) vdev cache of disks
  *	5) disks
  *
  * Some L2ARC device types exhibit extremely slow write performance.
  * To accommodate this, there are some significant differences between
  * the L2ARC and traditional cache design:
  *
  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
  * the ARC behave as usual, freeing buffers and placing headers on ghost
  * lists.  The ARC does not send buffers to the L2ARC during eviction as
  * this would add inflated write latencies for all ARC memory pressure.
  *
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
  * not already there. It scans until a headroom of buffers is satisfied,
  * which itself is a buffer for ARC eviction. If a compressible buffer is
  * found during scanning and selected for writing to an L2ARC device, we
  * temporarily boost scanning headroom during the next scan cycle to make
  * sure we adapt to compression effects (which might significantly reduce
  * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
  *	       head -->                        tail
  *	        +---------------------+----------+
  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
  *	        +---------------------+----------+   |   o L2ARC eligible
  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
  *	        +---------------------+----------+   |
  *	             15.9 Gbytes      ^ 32 Mbytes    |
  *	                           headroom          |
  *	                                      l2arc_feed_thread()
  *	                                             |
  *	                 l2arc write hand <--[oooo]--'
  *	                         |           8 Mbyte
  *	                         |          write max
  *	                         V
  *		  +==============================+
  *	L2ARC dev |####|#|###|###|    |####| ... |
  *	          +==============================+
  *	                     32 Gbytes
  *
  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
  * evicted, then the L2ARC has cached a buffer much sooner than it probably
  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
  * safe to say that this is an uncommon case, since buffers at the end of
  * the ARC lists have moved there due to inactivity.
  *
  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
  * then the L2ARC simply misses copying some buffers.  This serves as a
  * pressure valve to prevent heavy read workloads from both stalling the ARC
  * with waits and clogging the L2ARC with writes.  This also helps prevent
  * the potential for the L2ARC to churn if it attempts to cache content too
  * quickly, such as during backups of the entire pool.
  *
  * 5. After system boot and before the ARC has filled main memory, there are
  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
  * lists can remain mostly static.  Instead of searching from tail of these
  * lists as pictured, the l2arc_feed_thread() will search from the list heads
  * for eligible buffers, greatly increasing its chance of finding them.
  *
  * The L2ARC device write speed is also boosted during this time so that
  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
  * there are no L2ARC reads, and no fear of degrading read performance
  * through increased writes.
  *
  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
  * the vdev queue can aggregate them into larger and fewer writes.  Each
  * device is written to in a rotor fashion, sweeping writes through
  * available space then repeating.
  *
  * 7. The L2ARC does not store dirty content.  It never needs to flush
  * write buffers back to disk based storage.
  *
  * 8. If an ARC buffer is written (and dirtied) which also exists in the
  * L2ARC, the now stale L2ARC buffer is immediately dropped.
  *
  * The performance of the L2ARC can be tweaked by a number of tunables, which
  * may be necessary for different workloads:
  *
  *	l2arc_write_max		max write bytes per interval
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
  *	l2arc_headroom_boost	when we find compressed buffers during ARC
  *				scanning, we multiply headroom by this
  *				percentage factor for the next scan cycle,
  *				since more compressed buffers are likely to
  *				be present
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
  * integrated, and also may become zpool properties.
  *
  * There are three key functions that control how the L2ARC warms up:
  *
  *	l2arc_write_eligible()	check if a buffer is eligible to cache
  *	l2arc_write_size()	calculate how much to write
  *	l2arc_write_interval()	calculate sleep delay between writes
  *
  * These three functions determine what to write, how much, and how quickly
  * to send writes.
  *
  * L2ARC persistence:
  *
  * When writing buffers to L2ARC, we periodically add some metadata to
  * make sure we can pick them up after reboot, thus dramatically reducing
  * the impact that any downtime has on the performance of storage systems
  * with large caches.
  *
  * The implementation works fairly simply by integrating the following two
  * modifications:
  *
  * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
  *    which is an additional piece of metadata which describes what's been
  *    written. This allows us to rebuild the arc_buf_hdr_t structures of the
  *    main ARC buffers. There are 2 linked-lists of log blocks headed by
  *    dh_start_lbps[2]. We alternate which chain we append to, so they are
  *    time-wise and offset-wise interleaved, but that is an optimization rather
  *    than for correctness. The log block also includes a pointer to the
  *    previous block in its chain.
  *
  * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
  *    for our header bookkeeping purposes. This contains a device header,
  *    which contains our top-level reference structures. We update it each
  *    time we write a new log block, so that we're able to locate it in the
  *    L2ARC device. If this write results in an inconsistent device header
  *    (e.g. due to power failure), we detect this by verifying the header's
  *    checksum and simply fail to reconstruct the L2ARC after reboot.
  *
  * Implementation diagram:
  *
  * +=== L2ARC device (not to scale) ======================================+
  * |       ___two newest log block pointers__.__________                  |
  * |      /                                   \dh_start_lbps[1]           |
  * |	 /				       \         \dh_start_lbps[0]|
  * |.___/__.                                    V         V               |
  * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
  * ||   hdr|      ^         /^       /^        /         /                |
  * |+------+  ...--\-------/  \-----/--\------/         /                 |
  * |                \--------------/    \--------------/                  |
  * +======================================================================+
  *
  * As can be seen on the diagram, rather than using a simple linked list,
  * we use a pair of linked lists with alternating elements. This is a
  * performance enhancement due to the fact that we only find out the
  * address of the next log block access once the current block has been
  * completely read in. Obviously, this hurts performance, because we'd be
  * keeping the device's I/O queue only 1 operation deep, thus
  * incurring a large amount of I/O round-trip latency. Having two lists
  * allows us to fetch two log blocks ahead of where we are currently
  * rebuilding L2ARC buffers.
  *
  * On-device data structures:
  *
  * L2ARC device header:	l2arc_dev_hdr_phys_t
  * L2ARC log block:	l2arc_log_blk_phys_t
  *
  * L2ARC reconstruction:
  *
  * When writing data, we simply write in the standard rotary fashion,
  * evicting buffers as we go and simply writing new data over them (writing
  * a new log block every now and then). This obviously means that once we
  * loop around the end of the device, we will start cutting into an already
  * committed log block (and its referenced data buffers), like so:
  *
  *    current write head__       __old tail
  *                        \     /
  *                        V    V
  * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
  *                         ^    ^^^^^^^^^___________________________________
  *                         |                                                \
  *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
  *
  * When importing the pool, we detect this situation and use it to stop
  * our scanning process (see l2arc_rebuild).
  *
  * There is one significant caveat to consider when rebuilding ARC contents
  * from an L2ARC device: what about invalidated buffers? Given the above
  * construction, we cannot update blocks which we've already written to amend
  * them to remove buffers which were invalidated. Thus, during reconstruction,
  * we might be populating the cache with buffers for data that's not on the
  * main pool anymore, or may have been overwritten!
  *
  * As it turns out, this isn't a problem. Every arc_read request includes
  * both the DVA and, crucially, the birth TXG of the BP the caller is
  * looking for. So even if the cache were populated by completely rotten
  * blocks for data that had been long deleted and/or overwritten, we'll
  * never actually return bad data from the cache, since the DVA with the
  * birth TXG uniquely identify a block in space and time - once created,
  * a block is immutable on disk. The worst thing we have done is waste
  * some time and memory at l2arc rebuild to reconstruct outdated ARC
  * entries that will get dropped from the l2arc as it is being updated
  * with new blocks.
  *
  * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
  * hand are not restored. This is done by saving the offset (in bytes)
  * l2arc_evict() has evicted to in the L2ARC device header and taking it
  * into account when restoring buffers.
  */
 
 /*
  * Decide whether a header may be written to the L2ARC on behalf of the
  * spa identified by spa_guid.  Returns B_TRUE only when none of the
  * disqualifying conditions below holds.
  */
 static boolean_t
 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
 {
 	/* 1. The buffer belongs to a different spa. */
 	if (hdr->b_spa != spa_guid)
 		return (B_FALSE);
 
 	/* 2. The buffer is already cached on the L2ARC. */
 	if (HDR_HAS_L2HDR(hdr))
 		return (B_FALSE);
 
 	/* 3. An I/O is in progress (it may be an incomplete read). */
 	if (HDR_IO_IN_PROGRESS(hdr))
 		return (B_FALSE);
 
 	/* 4. The buffer is flagged not eligible (zfs property). */
 	if (!HDR_L2CACHE(hdr))
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Compute how many bytes the next L2ARC write pass may cover on dev.
  *
  * Starts from l2arc_write_max (reset to L2ARC_WRITE_SIZE, with a notice,
  * if it was zero), adds l2arc_write_boost while arc_warm is false, adds the
  * worst-case log block overhead and, for vdevs with trim support when
  * l2arc_trim_ahead is positive, the trim-ahead amount.  The result is
  * capped at a quarter of (l2ad_end - l2ad_start) and rounded up to the
  * vdev's ashift.
  */
 static uint64_t
 l2arc_write_size(l2arc_dev_t *dev)
 {
 	uint64_t wsize = l2arc_write_max;
 
 	/* The user may have altered the tunable to something meaningless. */
 	if (wsize == 0) {
 		cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, "
 		    "resetting it to the default (%d)", L2ARC_WRITE_SIZE);
 		l2arc_write_max = L2ARC_WRITE_SIZE;
 		wsize = L2ARC_WRITE_SIZE;
 	}
 
 	if (!arc_warm)
 		wsize += l2arc_write_boost;
 
 	/* Include the worst case scenario of log block overhead. */
 	wsize += l2arc_log_blk_overhead(wsize, dev);
 
 	if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) {
 		/*
 		 * Trim ahead by 64MB or l2arc_trim_ahead percent of the
 		 * write size, whichever is greater.
 		 */
 		uint64_t ahead = (wsize * l2arc_trim_ahead) / 100;
 
 		wsize += MAX(64 * 1024 * 1024, ahead);
 	}
 
 	/*
 	 * Keep the write size below the size of the cache device;
 	 * l2arc_evict() could otherwise iterate forever.
 	 */
 	wsize = MIN(wsize, (dev->l2ad_end - dev->l2ad_start) / 4);
 
 	return (P2ROUNDUP(wsize, 1ULL << dev->l2ad_vdev->vdev_ashift));
 }
 
 /*
  * Work out the lbolt value at which the next L2ARC feed should begin.
  *
  * began is when the previous feed started; wanted and wrote are the
  * bytes it aimed for and actually wrote.  The returned time is never
  * earlier than the current lbolt.
  */
 static clock_t
 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
 {
 	clock_t delay, now, earliest;
 
 	/*
 	 * A feed that wrote more than half of what it wanted means the ARC
 	 * lists are busy, so come back much sooner (if l2arc_feed_again
 	 * allows it).  Otherwise the lists are stale and we idle back to
 	 * the regular cadence.
 	 */
 	if (!l2arc_feed_again || wrote <= (wanted / 2))
 		delay = hz * l2arc_feed_secs;
 	else
 		delay = (hz * l2arc_feed_min_ms) / 1000;
 
 	now = ddi_get_lbolt();
 	earliest = MIN(now + delay, began + delay);
 
 	return (MAX(now, earliest));
 }
 
 /*
  * Cycle through L2ARC devices.  This is how L2ARC load balances.
  * If a device is returned, this also returns holding the spa config lock.
  *
  * The search resumes after l2arc_dev_last and wraps around the device
  * list, skipping devices whose vdev is dead or that are being rebuilt or
  * fully trimmed.  Returns NULL (with no config lock held) when there are
  * no devices or none is usable; l2arc_dev_last is updated to the result
  * whenever a search is performed.
  */
 static l2arc_dev_t *
 l2arc_dev_get_next(void)
 {
 	l2arc_dev_t *first, *next = NULL;
 
 	/*
 	 * Lock out the removal of spas (spa_namespace_lock), then removal
 	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
 	 * both locks will be dropped and a spa config lock held instead.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	mutex_enter(&l2arc_dev_mtx);
 
 	/* if there are no vdevs, there is nothing to do */
 	if (l2arc_ndev == 0)
 		goto out;
 
 	/* 'first' remembers the first candidate so a full lap is detected. */
 	first = NULL;
 	next = l2arc_dev_last;
 	do {
 		/* loop around the list looking for a non-faulted vdev */
 		if (next == NULL) {
 			next = list_head(l2arc_dev_list);
 		} else {
 			next = list_next(l2arc_dev_list, next);
 			if (next == NULL)
 				next = list_head(l2arc_dev_list);
 		}
 
 		/* if we have come back to the start, bail out */
 		if (first == NULL)
 			first = next;
 		else if (next == first)
 			break;
 
 		ASSERT3P(next, !=, NULL);
 	} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
 	    next->l2ad_trim_all);
 
 	/* if we were unable to find any usable vdevs, return NULL */
 	if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild ||
 	    next->l2ad_trim_all)
 		next = NULL;
 
 	l2arc_dev_last = next;
 
 out:
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * Grab the config lock to prevent the 'next' device from being
 	 * removed while we are writing to it.
 	 */
 	if (next != NULL)
 		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
 	/* The namespace lock is only dropped after the config lock is held. */
 	mutex_exit(&spa_namespace_lock);
 
 	return (next);
 }
 
 /*
  * Drain the l2arc_free_on_write list, releasing the ABD of every queued
  * entry and then the entry itself.  The whole drain runs with
  * l2arc_free_on_write_mtx held.
  */
 static void
 l2arc_do_free_on_write(void)
 {
 	l2arc_data_free_t *entry;
 
 	mutex_enter(&l2arc_free_on_write_mtx);
 	for (entry = list_remove_head(l2arc_free_on_write); entry != NULL;
 	    entry = list_remove_head(l2arc_free_on_write)) {
 		ASSERT3P(entry->l2df_abd, !=, NULL);
 		abd_free(entry->l2df_abd);
 		kmem_free(entry, sizeof (*entry));
 	}
 	mutex_exit(&l2arc_free_on_write_mtx);
 }
 
 /*
  * A write to a cache device has completed.  Update all headers to allow
  * reads from these buffers to begin.
  *
  * This is the done callback of the root zio created in
  * l2arc_write_buffers().  zio->io_private carries the
  * l2arc_write_callback_t naming the device and the dummy list head that
  * marks where this batch of headers begins on the device buflist.
  *
  * If the zio reports an error, every header of the batch is taken off the
  * buflist and loses its L2 header flag, the log block pointers belonging
  * to the log blocks of this write are removed again, and the start log
  * block pointers in the in-memory device header are restored from what is
  * left on the list.  In all cases ARC_FLAG_L2_WRITING is cleared on the
  * headers, and the dummy head and the callback structure are freed.
  */
 static void
 l2arc_write_done(zio_t *zio)
 {
 	l2arc_write_callback_t	*cb;
 	l2arc_lb_abd_buf_t	*abd_buf;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 	l2arc_dev_t		*dev;
 	l2arc_dev_hdr_phys_t	*l2dhdr;
 	list_t			*buflist;
 	arc_buf_hdr_t		*head, *hdr, *hdr_prev;
 	kmutex_t		*hash_lock;
 	int64_t			bytes_dropped = 0;
 
 	cb = zio->io_private;
 	ASSERT3P(cb, !=, NULL);
 	dev = cb->l2wcb_dev;
 	/* Check dev before it is dereferenced for the device header. */
 	ASSERT3P(dev, !=, NULL);
 	l2dhdr = dev->l2ad_dev_hdr;
 	head = cb->l2wcb_head;
 	ASSERT3P(head, !=, NULL);
 	buflist = &dev->l2ad_buflist;
 	ASSERT3P(buflist, !=, NULL);
 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
 	    l2arc_write_callback_t *, cb);
 
 	/*
 	 * All writes completed, or an error was hit.
 	 */
 top:
 	mutex_enter(&dev->l2ad_mtx);
 	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We cannot use mutex_enter or else we can deadlock
 		 * with l2arc_write_buffers (due to swapping the order
 		 * the hash lock and l2ad_mtx are taken).
 		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock. We must retry so we
 			 * don't leave the ARC_FLAG_L2_WRITING bit set.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
 
 			/*
 			 * We don't want to rescan the headers we've
 			 * already marked as having been written out, so
 			 * we reinsert the head node so we can pick up
 			 * where we left off.
 			 */
 			list_remove(buflist, head);
 			list_insert_after(buflist, hdr, head);
 
 			mutex_exit(&dev->l2ad_mtx);
 
 			/*
 			 * We wait for the hash lock to become available
 			 * to try and prevent busy waiting, and increase
 			 * the chance we'll be able to acquire the lock
 			 * the next time around.
 			 */
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto top;
 		}
 
 		/*
 		 * We could not have been moved into the arc_l2c_only
 		 * state while in-flight due to our ARC_FLAG_L2_WRITING
 		 * bit being set. Let's just ensure that's being enforced.
 		 */
 		ASSERT(HDR_HAS_L1HDR(hdr));
 
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop the L2ARC entry: take the header
 			 * off the buflist, clear its L2 header flag and
 			 * undo the arcstats, space and refcount accounting.
 			 */
 			list_remove(buflist, hdr);
 			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
 
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			l2arc_hdr_arcstats_decrement(hdr);
 
 			bytes_dropped +=
 			    vdev_psize_to_asize(dev->l2ad_vdev, psize);
 			(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
 			    arc_hdr_size(hdr), hdr);
 		}
 
 		/*
 		 * Allow ARC to begin reads and ghost list evictions to
 		 * this L2ARC entry.
 		 */
 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
 
 		mutex_exit(hash_lock);
 	}
 
 	/*
 	 * Free the allocated abd buffers for writing the log blocks.
 	 * If the zio failed reclaim the allocated space and remove the
 	 * pointers to these log blocks from the log block pointer list
 	 * of the L2ARC device.
 	 */
 	while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
 		abd_free(abd_buf->abd);
 		zio_buf_free(abd_buf, sizeof (*abd_buf));
 		if (zio->io_error != 0) {
 			lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
 			/*
 			 * L2BLK_GET_PSIZE returns aligned size for log
 			 * blocks.
 			 */
 			uint64_t asize =
 			    L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
 			bytes_dropped += asize;
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
 			    lb_ptr_buf);
 			zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
 			kmem_free(lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
 		}
 	}
 	list_destroy(&cb->l2wcb_abd_list);
 
 	if (zio->io_error != 0) {
 		ARCSTAT_BUMP(arcstat_l2_writes_error);
 
 		/*
 		 * Restore the lbps array in the header to its previous state.
 		 * If the list of log block pointers is empty, zero out the
 		 * log block pointers in the device header.
 		 */
 		lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
 		for (int i = 0; i < 2; i++) {
 			if (lb_ptr_buf == NULL) {
 				/*
 				 * If the list is empty zero out the device
 				 * header. Otherwise zero out the second log
 				 * block pointer in the header.
 				 */
 				if (i == 0) {
 					memset(l2dhdr, 0,
 					    dev->l2ad_dev_hdr_asize);
 				} else {
 					memset(&l2dhdr->dh_start_lbps[i], 0,
 					    sizeof (l2arc_log_blkptr_t));
 				}
 				break;
 			}
 			memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
 			    lb_ptr_buf);
 		}
 	}
 
 	/* The dummy head has served its purpose; unlink and free it. */
 	ARCSTAT_BUMP(arcstat_l2_writes_done);
 	list_remove(buflist, head);
 	ASSERT(!HDR_HAS_L1HDR(head));
 	kmem_cache_free(hdr_l2only_cache, head);
 	mutex_exit(&dev->l2ad_mtx);
 
 	/* Give back the space of everything dropped above (0 on success). */
 	ASSERT(dev->l2ad_vdev != NULL);
 	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
 
 	l2arc_do_free_on_write();
 
 	kmem_free(cb, sizeof (l2arc_write_callback_t));
 }
 
 /*
  * Undo the on-disk transforms of a block that was just read from an L2ARC
  * device into hdr->b_l1hdr.b_pabd: decrypt it if the block pointer says it
  * is encrypted, then decompress it if the header is compressed but is not
  * kept compressed in the ARC.  Whenever a step produces new data, b_pabd
  * and zio->io_abd are switched to the new buffer and the old one is freed.
  *
  * The caller must hold the header's hash lock.  Returns 0 on success or
  * the error reported by the failing decrypt/decompress step.
  */
 static int
 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
 {
 	arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
 	blkptr_t *bp = zio->io_bp;
 	spa_t *spa = zio->io_spa;
 	int err;
 
 	/*
 	 * ZIL data is never written to the L2ARC, so its unique MAC
 	 * storage needs no special handling here.
 	 */
 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 
 	/*
 	 * Decryption comes first.  The bp, not the hdr, decides whether
 	 * the data is encrypted, because the hdr's encryption parameters
 	 * are not updated until arc_read_done().
 	 */
 	if (BP_IS_ENCRYPTED(bp)) {
 		uint8_t salt[ZIO_DATA_SALT_LEN];
 		uint8_t iv[ZIO_DATA_IV_LEN];
 		uint8_t mac[ZIO_DATA_MAC_LEN];
 		boolean_t no_crypt = B_FALSE;
 		abd_t *plain = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
 		    ARC_HDR_USE_RESERVE);
 
 		zio_crypt_decode_params_bp(bp, salt, iv);
 		zio_crypt_decode_mac_bp(bp, mac);
 
 		err = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
 		    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
 		    salt, iv, mac, HDR_GET_PSIZE(hdr), plain,
 		    hdr->b_l1hdr.b_pabd, &no_crypt);
 		if (err != 0) {
 			arc_free_data_abd(hdr, plain, arc_hdr_size(hdr), hdr);
 			return (err);
 		}
 
 		if (no_crypt) {
 			/* Nothing was decrypted; the scratch abd is unused. */
 			arc_free_data_abd(hdr, plain, arc_hdr_size(hdr), hdr);
 		} else {
 			/* Swap the ciphertext for the decrypted data. */
 			arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 			    arc_hdr_size(hdr), hdr);
 			hdr->b_l1hdr.b_pabd = plain;
 			zio->io_abd = plain;
 		}
 	}
 
 	/*
 	 * A block that is compressed on the L2ARC device while ARC
 	 * compression is disabled gets decompressed into a fresh buffer
 	 * which then replaces the existing data.
 	 */
 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) {
 		abd_t *expanded = arc_get_data_abd(hdr, arc_hdr_size(hdr),
 		    hdr, ARC_HDR_USE_RESERVE);
 		void *dst = abd_borrow_buf(expanded, arc_hdr_size(hdr));
 
 		err = zio_decompress_data(HDR_GET_COMPRESS(hdr),
 		    hdr->b_l1hdr.b_pabd, dst, HDR_GET_PSIZE(hdr),
 		    HDR_GET_LSIZE(hdr), &hdr->b_complevel);
 
 		/* The borrowed buffer is handed back on both outcomes. */
 		abd_return_buf_copy(expanded, dst, arc_hdr_size(hdr));
 		if (err != 0) {
 			arc_free_data_abd(hdr, expanded, arc_hdr_size(hdr),
 			    hdr);
 			return (err);
 		}
 
 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
 		    arc_hdr_size(hdr), hdr);
 		hdr->b_l1hdr.b_pabd = expanded;
 		zio->io_abd = expanded;
 		zio->io_size = HDR_GET_LSIZE(hdr);
 	}
 
 	return (0);
 }
 
 
 /*
  * A read to a cache device completed.  Validate buffer contents before
  * handing over to the regular ARC routines.
  *
  * zio->io_private holds the l2arc_read_callback_t for this read; it is
  * freed before returning.  If the checksum matches, the untransform step
  * succeeds, the zio carries no error and the header was not flagged as
  * L2-evicted, the zio is passed to arc_read_done().  Otherwise the
  * failure is counted and, when nobody is waiting on the zio, a new read
  * of the block pointer saved in the callback is issued via zio_read().
  */
 static void
 l2arc_read_done(zio_t *zio)
 {
 	int tfm_error = 0;
 	l2arc_read_callback_t *cb = zio->io_private;
 	arc_buf_hdr_t *hdr;
 	kmutex_t *hash_lock;
 	boolean_t valid_cksum;
 	/* True when the data is read into the raw (b_rabd) buffer. */
 	boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
 	    (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
 
 	ASSERT3P(zio->io_vd, !=, NULL);
 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
 
 	/* Release the SCL_L2ARC config lock held with io_vd as its tag. */
 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
 
 	ASSERT3P(cb, !=, NULL);
 	hdr = cb->l2rcb_hdr;
 	ASSERT3P(hdr, !=, NULL);
 
 	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
 	 * If the data was read into a temporary buffer,
 	 * move it and free the buffer.
 	 */
 	if (cb->l2rcb_abd != NULL) {
 		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
 		if (zio->io_error == 0) {
 			if (using_rdata) {
 				abd_copy(hdr->b_crypt_hdr.b_rabd,
 				    cb->l2rcb_abd, arc_hdr_size(hdr));
 			} else {
 				abd_copy(hdr->b_l1hdr.b_pabd,
 				    cb->l2rcb_abd, arc_hdr_size(hdr));
 			}
 		}
 
 		/*
 		 * The following must be done regardless of whether
 		 * there was an error:
 		 * - free the temporary buffer
 		 * - point zio to the real ARC buffer
 		 * - set zio size accordingly
 		 * These are required because zio is either re-used for
 		 * an I/O of the block in the case of the error
 		 * or the zio is passed to arc_read_done() and it
 		 * needs real data.
 		 */
 		abd_free(cb->l2rcb_abd);
 		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
 
 		if (using_rdata) {
 			ASSERT(HDR_HAS_RABD(hdr));
 			zio->io_abd = zio->io_orig_abd =
 			    hdr->b_crypt_hdr.b_rabd;
 		} else {
 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
 			zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
 		}
 	}
 
 	ASSERT3P(zio->io_abd, !=, NULL);
 
 	/*
 	 * Check this survived the L2ARC journey.
 	 */
 	ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
 	    (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
 	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
 	zio->io_prop.zp_complevel = hdr->b_complevel;
 
 	valid_cksum = arc_cksum_is_equal(hdr, zio);
 
 	/*
 	 * b_rabd will always match the data as it exists on disk if it is
 	 * being used. Therefore if we are reading into b_rabd we do not
 	 * attempt to untransform the data.
 	 */
 	if (valid_cksum && !using_rdata)
 		tfm_error = l2arc_untransform(zio, cb);
 
 	if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
 	    !HDR_L2_EVICTED(hdr)) {
 		/* Success: drop the hash lock and finish as an ARC read. */
 		mutex_exit(hash_lock);
 		zio->io_private = hdr;
 		arc_read_done(zio);
 	} else {
 		/*
 		 * Buffer didn't survive caching.  Increment stats and
 		 * reissue to the original storage device.
 		 */
 		if (zio->io_error != 0) {
 			ARCSTAT_BUMP(arcstat_l2_io_error);
 		} else {
 			zio->io_error = SET_ERROR(EIO);
 		}
 		if (!valid_cksum || tfm_error != 0)
 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
 
 		/*
 		 * If there's no waiter, issue an async i/o to the primary
 		 * storage now.  If there *is* a waiter, the caller must
 		 * issue the i/o in a context where it's OK to block.
 		 */
 		if (zio->io_waiter == NULL) {
 			zio_t *pio = zio_unique_parent(zio);
 			/* Read into the same buffer the L2ARC read used. */
 			void *abd = (using_rdata) ?
 			    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
 
 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
 
 			zio = zio_read(pio, zio->io_spa, zio->io_bp,
 			    abd, zio->io_size, arc_read_done,
 			    hdr, zio->io_priority, cb->l2rcb_flags,
 			    &cb->l2rcb_zb);
 
 			/*
 			 * Original ZIO will be freed, so we need to update
 			 * ARC header with the new ZIO pointer to be used
 			 * by zio_change_priority() in arc_read().
 			 */
 			for (struct arc_callback *acb = hdr->b_l1hdr.b_acb;
 			    acb != NULL; acb = acb->acb_next)
 				acb->acb_zio_head = zio;
 
 			mutex_exit(hash_lock);
 			zio_nowait(zio);
 		} else {
 			mutex_exit(hash_lock);
 		}
 	}
 
 	kmem_free(cb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * Map a feed pass number to the ARC list the L2ARC should search next and
  * return one of its sublists, locked.  Callers loop list_num over
  * 0..3, which yields this priority order (the order can noticeably affect
  * cache performance):
  *
  *	0: MFU metadata
  *	1: MRU metadata
  *	2: MFU data
  *	3: MRU data
  *
  * Any other list_num returns NULL.
  */
 static multilist_sublist_t *
 l2arc_sublist_lock(int list_num)
 {
 	multilist_t *ml = NULL;
 
 	ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES);
 
 	if (list_num == 0)
 		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
 	else if (list_num == 1)
 		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
 	else if (list_num == 2)
 		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
 	else if (list_num == 3)
 		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
 	else
 		return (NULL);
 
 	/*
 	 * Picking the sublist at random is fine here: each call feeds
 	 * only a small amount of data (8MB), and later calls will land
 	 * on other sublists.
 	 */
 	return (multilist_sublist_lock_idx(ml,
 	    multilist_get_random_index(ml)));
 }
 
 /*
  * Upper bound on the space that L2ARC metadata log blocks can take for a
  * write of write_sz bytes to dev.  l2arc_evict and l2arc_write_size add
  * this overhead so that enough headroom is available when buffers are
  * written.  Returns 0 when the device has no log entries per block
  * (l2ad_log_entries == 0).
  */
 static inline uint64_t
 l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
 {
 	if (dev->l2ad_log_entries == 0)
 		return (0);
 
 	/* Count one log entry per 2^SPA_MINBLOCKSHIFT bytes written. */
 	uint64_t nentries = write_sz >> SPA_MINBLOCKSHIFT;
 
 	/* Round up to whole log blocks. */
 	uint64_t nblocks = (nentries + dev->l2ad_log_entries - 1) /
 	    dev->l2ad_log_entries;
 
 	uint64_t blk_asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    sizeof (l2arc_log_blk_phys_t));
 
 	return (blk_asize * nblocks);
 }
 
 /*
  * Evict buffers from the device write hand to the distance specified in
  * bytes. This distance may span populated buffers, it may span nothing.
  * This is clearing a region on the L2ARC device ready for writing.
  * If the 'all' boolean is set, every buffer is evicted.
  *
  * dev:      the L2ARC device to evict from.
  * distance: number of bytes ahead of dev->l2ad_hand to clear.
  * all:      evict every buffer and log block pointer regardless of address.
  *
  * When hand + distance would run past l2ad_end (and 'all' is not set),
  * the region up to l2ad_end is evicted, both hands are reset to
  * l2ad_start and the eviction is run again from there.
  */
 static void
 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
 {
 	list_t *buflist;
 	arc_buf_hdr_t *hdr, *hdr_prev;
 	kmutex_t *hash_lock;
 	uint64_t taddr;
 	l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
 	vdev_t *vd = dev->l2ad_vdev;
 	boolean_t rerun;
 
 	buflist = &dev->l2ad_buflist;
 
 top:
 	rerun = B_FALSE;
 	if (dev->l2ad_hand + distance > dev->l2ad_end) {
 		/*
 		 * When there is no space to accommodate upcoming writes,
 		 * evict to the end. Then bump the write and evict hands
 		 * to the start and iterate. This iteration does not
 		 * happen indefinitely as we make sure in
 		 * l2arc_write_size() that when the write hand is reset,
 		 * the write size does not exceed the end of the device.
 		 */
 		rerun = B_TRUE;
 		taddr = dev->l2ad_end;
 	} else {
 		taddr = dev->l2ad_hand + distance;
 	}
 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
 	    uint64_t, taddr, boolean_t, all);
 
 	if (!all) {
 		/*
 		 * This check has to be placed after deciding whether to
 		 * iterate (rerun).
 		 */
 		if (dev->l2ad_first) {
 			/*
 			 * This is the first sweep through the device. There is
 			 * nothing to evict. We have already trimmed the
 			 * whole device.
 			 */
 			goto out;
 		} else {
 			/*
 			 * Trim the space to be evicted.
 			 */
 			if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
 			    l2arc_trim_ahead > 0) {
 				/*
 				 * We have to drop the spa_config lock because
 				 * vdev_trim_range() will acquire it.
 				 * l2ad_evict already accounts for the label
 				 * size. To prevent vdev_trim_ranges() from
 				 * adding it again, we subtract it from
 				 * l2ad_evict.
 				 */
 				spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
 				vdev_trim_simple(vd,
 				    dev->l2ad_evict - VDEV_LABEL_START_SIZE,
 				    taddr - dev->l2ad_evict);
 				spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
 				    RW_READER);
 			}
 
 			/*
 			 * When rebuilding L2ARC we retrieve the evict hand
 			 * from the header of the device. Of note, l2arc_evict()
 			 * does not actually delete buffers from the cache
 			 * device, but trimming may do so depending on the
 			 * hardware implementation. Thus keeping track of the
 			 * evict hand is useful.
 			 */
 			dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
 		}
 	}
 
 retry:
 	mutex_enter(&dev->l2ad_mtx);
 	/*
 	 * We have to account for evicted log blocks. Run vdev_space_update()
 	 * on log blocks whose offset (in bytes) is before the evicted offset
 	 * (in bytes) by searching in the list of pointers to log blocks
 	 * present in the L2ARC device.
 	 */
 	for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
 	    lb_ptr_buf = lb_ptr_buf_prev) {
 
 		lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
 
 		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 		uint64_t asize = L2BLK_GET_PSIZE(
 		    (lb_ptr_buf->lb_ptr)->lbp_prop);
 
 		/*
 		 * We don't worry about log blocks left behind (ie
 		 * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
 		 * will never write more than l2arc_evict() evicts.
 		 */
 		if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
 			break;
 		} else {
 			/* Undo the accounting for this log block, then free it. */
 			vdev_space_update(vd, -asize, 0, 0);
 			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
 			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
 			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
 			    lb_ptr_buf);
 			zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
 			list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
 			kmem_free(lb_ptr_buf->lb_ptr,
 			    sizeof (l2arc_log_blkptr_t));
 			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
 		}
 	}
 
 	/* Walk the buflist from its tail, evicting headers in range. */
 	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
 		hdr_prev = list_prev(buflist, hdr);
 
 		ASSERT(!HDR_EMPTY(hdr));
 		hash_lock = HDR_LOCK(hdr);
 
 		/*
 		 * We cannot use mutex_enter or else we can deadlock
 		 * with l2arc_write_buffers (due to swapping the order
 		 * the hash lock and l2ad_mtx are taken).
 		 */
 		if (!mutex_tryenter(hash_lock)) {
 			/*
 			 * Missed the hash lock.  Retry.
 			 */
 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
 			mutex_exit(&dev->l2ad_mtx);
 			mutex_enter(hash_lock);
 			mutex_exit(hash_lock);
 			goto retry;
 		}
 
 		/*
 		 * A header can't be on this list if it doesn't have L2 header.
 		 */
 		ASSERT(HDR_HAS_L2HDR(hdr));
 
 		/* Ensure this header has finished being written. */
 		ASSERT(!HDR_L2_WRITING(hdr));
 		ASSERT(!HDR_L2_WRITE_HEAD(hdr));
 
 		if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
 		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
 			/*
 			 * We've evicted to the target address,
 			 * or the end of the device.
 			 */
 			mutex_exit(hash_lock);
 			break;
 		}
 
 		if (!HDR_HAS_L1HDR(hdr)) {
 			ASSERT(!HDR_L2_READING(hdr));
 			/*
 			 * This doesn't exist in the ARC.  Destroy.
 			 * arc_hdr_destroy() will call list_remove()
 			 * and decrement arcstat_l2_lsize.
 			 */
 			arc_change_state(arc_anon, hdr);
 			arc_hdr_destroy(hdr);
 		} else {
 			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
 			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
 			/*
 			 * Invalidate issued or about to be issued
 			 * reads, since we may be about to write
 			 * over this location.
 			 */
 			if (HDR_L2_READING(hdr)) {
 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
 				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
 			}
 
 			arc_hdr_l2hdr_destroy(hdr);
 		}
 		mutex_exit(hash_lock);
 	}
 	mutex_exit(&dev->l2ad_mtx);
 
 out:
 	/*
 	 * We need to check if we evict all buffers, otherwise we may iterate
 	 * unnecessarily.
 	 */
 	if (!all && rerun) {
 		/*
 		 * Bump device hand to the device start if it is approaching the
 		 * end. l2arc_evict() has already evicted ahead for this case.
 		 */
 		dev->l2ad_hand = dev->l2ad_start;
 		dev->l2ad_evict = dev->l2ad_start;
 		dev->l2ad_first = B_FALSE;
 		goto top;
 	}
 
 	if (!all) {
 		/*
 		 * In case of cache device removal (all) the following
 		 * assertions may be violated without functional consequences
 		 * as the device is about to be removed.
 		 */
 		ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end);
 		if (!dev->l2ad_first)
 			ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
 	}
 }
 
 /*
  * Handle any abd transforms that might be required for writing to the L2ARC.
  * If successful, this function will always return an abd with the data
  * transformed as it is on disk in a new abd of asize bytes.
  *
  * spa:     pool used for the keystore lookup of encrypted headers.
  * hdr:     header whose data is to be written.
  * asize:   size of the buffer to produce; the tail past psize is zeroed.
  * abd_out: receives the newly allocated abd on success, NULL on error.
  *
  * Returns 0 on success.  Returns EIO if the data cannot be re-compressed
  * into the original psize (in that case *abd_out is left untouched), or
  * the error from the key lookup / encryption step.
  */
 static int
 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
     abd_t **abd_out)
 {
 	int ret;
 	abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
 	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
 	uint64_t psize = HDR_GET_PSIZE(hdr);
 	uint64_t size = arc_hdr_size(hdr);
 	boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
 	dsl_crypto_key_t *dck = NULL;
 	uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
 	boolean_t no_crypt = B_FALSE;
 
 	/* The caller only gets here when a private copy is really needed. */
 	ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
 	    !HDR_COMPRESSION_ENABLED(hdr)) ||
 	    HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
 	ASSERT3U(psize, <=, asize);
 
 	/*
 	 * If this data simply needs its own buffer, we simply allocate it
 	 * and copy the data. This may be done to eliminate a dependency on a
 	 * shared buffer or to reallocate the buffer to match asize.
 	 */
 	if (HDR_HAS_RABD(hdr)) {
 		ASSERT3U(asize, >, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
 		abd_zero_off(to_write, psize, asize - psize);
 		goto out;
 	}
 
 	/* Neither compression nor encryption to redo: copy and zero-pad. */
 	if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
 	    !HDR_ENCRYPTED(hdr)) {
 		ASSERT3U(size, ==, psize);
 		to_write = abd_alloc_for_io(asize, ismd);
 		abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
 		if (asize > size)
 			abd_zero_off(to_write, size, asize - size);
 		goto out;
 	}
 
 	/* The ARC holds the data uncompressed: re-compress it. */
 	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
 		size_t bufsize = MAX(size, asize);
 		void *buf = zio_buf_alloc(bufsize);
 		uint64_t csize = zio_compress_data(compress, to_write, &buf,
 		    size, hdr->b_complevel);
 		if (csize > psize) {
 			/*
 			 * We can't re-compress the block into the original
 			 * psize.  Even if it fits into asize, it does not
 			 * matter, since checksum will never match on read.
 			 */
 			zio_buf_free(buf, bufsize);
 			return (SET_ERROR(EIO));
 		}
 		if (asize > csize)
 			memset((char *)buf + csize, 0, asize - csize);
 		/* cabd takes ownership of buf and frees it with the abd. */
 		to_write = cabd = abd_get_from_buf(buf, bufsize);
 		abd_take_ownership_of_buf(cabd, B_TRUE);
 	}
 
 	if (HDR_ENCRYPTED(hdr)) {
 		eabd = abd_alloc_for_io(asize, ismd);
 
 		/*
 		 * If the dataset was disowned before the buffer
 		 * made it to this point, the key to re-encrypt
 		 * it won't be available. In this case we simply
 		 * won't write the buffer to the L2ARC.
 		 */
 		ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
 		    FTAG, &dck);
 		if (ret != 0)
 			goto error;
 
 		ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
 		    hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
 		    hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
 		    &no_crypt);
 		if (ret != 0)
 			goto error;
 
 		/* Nothing was encrypted, so carry the input over unchanged. */
 		if (no_crypt)
 			abd_copy(eabd, to_write, psize);
 
 		if (psize != asize)
 			abd_zero_off(eabd, psize, asize - psize);
 
 		/* assert that the MAC we got here matches the one we saved */
 		ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
 
 		/* The intermediate compressed copy is no longer needed. */
 		if (to_write == cabd)
 			abd_free(cabd);
 
 		to_write = eabd;
 	}
 
 out:
 	/* Every successful path must have produced a private buffer. */
 	ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
 	*abd_out = to_write;
 	return (0);
 
 error:
 	/* Release whatever was acquired before the failure. */
 	if (dck != NULL)
 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
 	if (cabd != NULL)
 		abd_free(cabd);
 	if (eabd != NULL)
 		abd_free(eabd);
 
 	*abd_out = NULL;
 	return (ret);
 }
 
 /*
  * Done callback that cleans up after a block fetch: the
  * l2arc_read_callback_t found in zio->io_private has its l2rcb_abd freed
  * (when one is attached) and is then freed itself.
  */
 static void
 l2arc_blk_fetch_done(zio_t *zio)
 {
 	l2arc_read_callback_t *rcb = zio->io_private;
 
 	if (rcb->l2rcb_abd != NULL)
 		abd_free(rcb->l2rcb_abd);
 
 	kmem_free(rcb, sizeof (l2arc_read_callback_t));
 }
 
 /*
  * Find and write ARC buffers to the L2ARC device.
  *
  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
  * For each pass the scan headroom is target_sz * l2arc_headroom, scaled by
  * l2arc_headroom_boost percent when zfs_compressed_arc_enabled is set.
  *
  * spa:       pool being fed; its load guid is passed to
  *            l2arc_write_eligible() for every candidate header.
  * dev:       cache device to write to, starting at dev->l2ad_hand.
  * target_sz: upper bound on the allocated size written in this call.
  *
  * Returns the number of bytes actually written (which may be smaller than
  * the delta by which the device hand has changed due to alignment and the
  * writing of log blocks).
  */
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
 	arc_buf_hdr_t 		*hdr, *head, *marker;
 	uint64_t 		write_asize, write_psize, headroom;
 	boolean_t		full, from_head = !arc_warm;
 	l2arc_write_callback_t	*cb = NULL;
 	zio_t 			*pio, *wzio;
 	uint64_t 		guid = spa_load_guid(spa);
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 
 	ASSERT3P(dev->l2ad_vdev, !=, NULL);
 
 	pio = NULL;
 	write_asize = write_psize = 0;
 	full = B_FALSE;
 	/* Dummy header that marks the start of this batch on the buflist. */
 	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
 	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
 	marker = arc_state_alloc_marker();
 
 	/*
 	 * Copy buffers for L2ARC writing.
 	 */
 	for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) {
 		/*
-		 * If pass == 1 or 3, we cache MRU metadata and data
-		 * respectively.
+		 * pass == 0: MFU meta
+		 * pass == 1: MRU meta
+		 * pass == 2: MFU data
+		 * pass == 3: MRU data
 		 */
-		if (l2arc_mfuonly) {
+		if (l2arc_mfuonly == 1) {
 			if (pass == 1 || pass == 3)
 				continue;
+		} else if (l2arc_mfuonly > 1) {
+			if (pass == 3)
+				continue;
 		}
 
 		uint64_t passed_sz = 0;
 		headroom = target_sz * l2arc_headroom;
 		if (zfs_compressed_arc_enabled)
 			headroom = (headroom * l2arc_headroom_boost) / 100;
 
 		/*
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
 		multilist_sublist_t *mls = l2arc_sublist_lock(pass);
 		ASSERT3P(mls, !=, NULL);
 		if (from_head)
 			hdr = multilist_sublist_head(mls);
 		else
 			hdr = multilist_sublist_tail(mls);
 
 		while (hdr != NULL) {
 			kmutex_t *hash_lock;
 			abd_t *to_write = NULL;
 
 			hash_lock = HDR_LOCK(hdr);
 			if (!mutex_tryenter(hash_lock)) {
 skip:
 				/* Skip this buffer rather than waiting. */
 				if (from_head)
 					hdr = multilist_sublist_next(mls, hdr);
 				else
 					hdr = multilist_sublist_prev(mls, hdr);
 				continue;
 			}
 
 			passed_sz += HDR_GET_LSIZE(hdr);
 			if (l2arc_headroom != 0 && passed_sz > headroom) {
 				/*
 				 * Searched too far.
 				 */
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			if (!l2arc_write_eligible(guid, hdr)) {
 				mutex_exit(hash_lock);
 				goto skip;
 			}
 
 			ASSERT(HDR_HAS_L1HDR(hdr));
 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
 			ASSERT3U(arc_hdr_size(hdr), >, 0);
 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
 			    HDR_HAS_RABD(hdr));
 			uint64_t psize = HDR_GET_PSIZE(hdr);
 			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
 			    psize);
 
 			/*
 			 * If the allocated size of this buffer plus the max
 			 * size for the pending log block exceeds the evicted
 			 * target size, terminate writing buffers for this run.
 			 */
 			if (write_asize + asize +
 			    sizeof (l2arc_log_blk_phys_t) > target_sz) {
 				full = B_TRUE;
 				mutex_exit(hash_lock);
 				break;
 			}
 
 			/*
 			 * We should not sleep with sublist lock held or it
 			 * may block ARC eviction.  Insert a marker to save
 			 * the position and drop the lock.
 			 */
 			if (from_head) {
 				multilist_sublist_insert_after(mls, hdr,
 				    marker);
 			} else {
 				multilist_sublist_insert_before(mls, hdr,
 				    marker);
 			}
 			multilist_sublist_unlock(mls);
 
 			/*
 			 * If this header has b_rabd, we can use this since it
 			 * must always match the data exactly as it exists on
 			 * disk. Otherwise, the L2ARC can normally use the
 			 * hdr's data, but if we're sharing data between the
 			 * hdr and one of its bufs, L2ARC needs its own copy of
 			 * the data so that the ZIO below can't race with the
 			 * buf consumer. To ensure that this copy will be
 			 * available for the lifetime of the ZIO and be cleaned
 			 * up afterwards, we add it to the l2arc_free_on_write
 			 * queue. If we need to apply any transforms to the
 			 * data (compression, encryption) we will also need the
 			 * extra buffer.
 			 */
 			if (HDR_HAS_RABD(hdr) && psize == asize) {
 				to_write = hdr->b_crypt_hdr.b_rabd;
 			} else if ((HDR_COMPRESSION_ENABLED(hdr) ||
 			    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
 			    !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
 			    psize == asize) {
 				to_write = hdr->b_l1hdr.b_pabd;
 			} else {
 				int ret;
 				arc_buf_contents_t type = arc_buf_type(hdr);
 
 				ret = l2arc_apply_transforms(spa, hdr, asize,
 				    &to_write);
 				if (ret != 0) {
 					/*
 					 * The transform failed: clear the
 					 * header's L2CACHE flag and move on
 					 * to the next header.
 					 */
 					arc_hdr_clear_flags(hdr,
 					    ARC_FLAG_L2CACHE);
 					mutex_exit(hash_lock);
 					goto next;
 				}
 
 				l2arc_free_abd_on_write(to_write, asize, type);
 			}
 
 			/* Record where on the device this buffer will live. */
 			hdr->b_l2hdr.b_dev = dev;
 			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
 			hdr->b_l2hdr.b_hits = 0;
 			hdr->b_l2hdr.b_arcs_state =
 			    hdr->b_l1hdr.b_state->arcs_state;
 			mutex_enter(&dev->l2ad_mtx);
 			if (pio == NULL) {
 				/*
 				 * Insert a dummy header on the buflist so
 				 * l2arc_write_done() can find where the
 				 * write buffers begin without searching.
 				 */
 				list_insert_head(&dev->l2ad_buflist, head);
 			}
 			list_insert_head(&dev->l2ad_buflist, hdr);
 			mutex_exit(&dev->l2ad_mtx);
 			arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
 			    ARC_FLAG_L2_WRITING);
 
 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 			    arc_hdr_size(hdr), hdr);
 			l2arc_hdr_arcstats_increment(hdr);
 
 			boolean_t commit = l2arc_log_blk_insert(dev, hdr);
 			mutex_exit(hash_lock);
 
 			/* First buffer of the run: set up the root zio. */
 			if (pio == NULL) {
 				cb = kmem_alloc(
 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
 				cb->l2wcb_dev = dev;
 				cb->l2wcb_head = head;
 				list_create(&cb->l2wcb_abd_list,
 				    sizeof (l2arc_lb_abd_buf_t),
 				    offsetof(l2arc_lb_abd_buf_t, node));
 				pio = zio_root(spa, l2arc_write_done, cb,
 				    ZIO_FLAG_CANFAIL);
 			}
 
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    dev->l2ad_hand, asize, to_write,
 			    ZIO_CHECKSUM_OFF, NULL, hdr,
 			    ZIO_PRIORITY_ASYNC_WRITE,
 			    ZIO_FLAG_CANFAIL, B_FALSE);
 
 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
 			    zio_t *, wzio);
 			zio_nowait(wzio);
 
 			write_psize += psize;
 			write_asize += asize;
 			dev->l2ad_hand += asize;
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 			if (commit) {
 				/* l2ad_hand will be adjusted inside. */
 				write_asize +=
 				    l2arc_log_blk_commit(dev, pio, cb);
 			}
 
 next:
 			/* Retake the sublist lock and resume at the marker. */
 			multilist_sublist_lock(mls);
 			if (from_head)
 				hdr = multilist_sublist_next(mls, marker);
 			else
 				hdr = multilist_sublist_prev(mls, marker);
 			multilist_sublist_remove(mls, marker);
 		}
 
 		multilist_sublist_unlock(mls);
 
 		if (full == B_TRUE)
 			break;
 	}
 
 	arc_state_free_marker(marker);
 
 	/* No buffers selected for writing? */
 	if (pio == NULL) {
 		ASSERT0(write_psize);
 		ASSERT(!HDR_HAS_L1HDR(head));
 		kmem_cache_free(hdr_l2only_cache, head);
 
 		/*
 		 * Although we did not write any buffers l2ad_evict may
 		 * have advanced.
 		 */
 		if (dev->l2ad_evict != l2dhdr->dh_evict)
 			l2arc_dev_hdr_update(dev);
 
 		return (0);
 	}
 
 	if (!dev->l2ad_first)
 		ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict);
 
 	ASSERT3U(write_asize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
 
 	/* Flag the device as writing for the duration of the wait. */
 	dev->l2ad_writing = B_TRUE;
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
 	/*
 	 * Update the device header after the zio completes as
 	 * l2arc_write_done() may have updated the memory holding the log block
 	 * pointers in the device header.
 	 */
 	l2arc_dev_hdr_update(dev);
 
 	return (write_asize);
 }
 
 static boolean_t
 l2arc_hdr_limit_reached(void)
 {
 	int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size);
 
 	return (arc_reclaim_needed() ||
 	    (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100));
 }
 
 /*
  * This thread feeds the L2ARC at regular intervals.  This is the beating
  * heart of the L2ARC.
  *
  * Each pass sleeps on l2arc_feed_thr_cv until `next', picks a device with
  * l2arc_dev_get_next(), evicts the region that is about to be overwritten,
  * writes new buffers and finally asks l2arc_write_interval() when to wake
  * up again.  The loop runs until l2arc_thread_exit becomes non-zero; on the
  * way out the flag is reset to 0 and l2arc_feed_thr_cv is broadcast, which
  * is what l2arc_stop() waits for.
  */
 static  __attribute__((noreturn)) void
 l2arc_feed_thread(void *unused)
 {
 	(void) unused;
 	callb_cpr_t cpr;
 	l2arc_dev_t *dev;
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
 	fstrans_cookie_t cookie;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
 	/* l2arc_feed_thr_lock is held for the whole life of the loop below. */
 	mutex_enter(&l2arc_feed_thr_lock);
 
 	cookie = spl_fstrans_mark();
 	while (l2arc_thread_exit == 0) {
 		CALLB_CPR_SAFE_BEGIN(&cpr);
 		(void) cv_timedwait_idle(&l2arc_feed_thr_cv,
 		    &l2arc_feed_thr_lock, next);
 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
 		/* Default wakeup for every `continue' below: hz ticks from now. */
 		next = ddi_get_lbolt() + hz;
 
 		/*
 		 * Quick check for L2ARC devices.
 		 */
 		mutex_enter(&l2arc_dev_mtx);
 		if (l2arc_ndev == 0) {
 			mutex_exit(&l2arc_dev_mtx);
 			continue;
 		}
 		mutex_exit(&l2arc_dev_mtx);
 		/* Start of this feed cycle, handed to l2arc_write_interval(). */
 		begin = ddi_get_lbolt();
 
 		/*
 		 * This selects the next l2arc device to write to, and in
 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
 		 * will return NULL if there are now no l2arc devices or if
 		 * they are all faulted.
 		 *
 		 * If a device is returned, its spa's config lock is also
 		 * held to prevent device removal.  l2arc_dev_get_next()
 		 * will grab and release l2arc_dev_mtx.
 		 */
 		if ((dev = l2arc_dev_get_next()) == NULL)
 			continue;
 
 		spa = dev->l2ad_spa;
 		ASSERT3P(spa, !=, NULL);
 
 		/*
 		 * If the pool is read-only then force the feed thread to
 		 * sleep a little longer.
 		 */
 		if (!spa_writeable(spa)) {
 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		/*
 		 * Avoid contributing to memory pressure.
 		 */
 		if (l2arc_hdr_limit_reached()) {
 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
 			spa_config_exit(spa, SCL_L2ARC, dev);
 			continue;
 		}
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
 		/* Target amount for this cycle; also used as the evict size. */
 		size = l2arc_write_size(dev);
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
 		 */
 		l2arc_evict(dev, size, B_FALSE);
 
 		/*
 		 * Write ARC buffers.
 		 */
 		wrote = l2arc_write_buffers(spa, dev, size);
 
 		/*
 		 * Calculate interval between writes.
 		 */
 		next = l2arc_write_interval(begin, size, wrote);
 		/* Drop the config lock taken on our behalf by dev_get_next(). */
 		spa_config_exit(spa, SCL_L2ARC, dev);
 	}
 	spl_fstrans_unmark(cookie);
 
 	/*
 	 * Acknowledge the exit request: l2arc_stop() sleeps on
 	 * l2arc_feed_thr_cv until l2arc_thread_exit reads 0 again.
 	 */
 	l2arc_thread_exit = 0;
 	cv_broadcast(&l2arc_feed_thr_cv);
 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
 	thread_exit();
 }
 
 /*
  * Reports whether l2arc_vdev_get() finds an l2arc_dev_t for the given vdev,
  * i.e. whether the vdev is currently on the global L2ARC device list.
  */
 boolean_t
 l2arc_vdev_present(vdev_t *vd)
 {
 	l2arc_dev_t *found = l2arc_vdev_get(vd);
 
 	return (found != NULL);
 }
 
 /*
  * Looks up the l2arc_dev_t whose l2ad_vdev is `vd' by walking the global
  * device list under l2arc_dev_mtx.  Returns NULL when no entry matches,
  * i.e. when the vdev_t isn't an L2ARC device.
  */
 l2arc_dev_t *
 l2arc_vdev_get(vdev_t *vd)
 {
 	l2arc_dev_t	*match;
 
 	mutex_enter(&l2arc_dev_mtx);
 	match = list_head(l2arc_dev_list);
 	while (match != NULL && match->l2ad_vdev != vd)
 		match = list_next(l2arc_dev_list, match);
 	mutex_exit(&l2arc_dev_mtx);
 
 	return (match);
 }
 
 /*
  * Decides what to do with the persistent state of an L2ARC device:
  * mark it for rebuild (l2ad_rebuild), request a whole-device TRIM
  * (l2ad_trim_all) or write a fresh, zeroed device header.
  *
  * `reopen' is B_TRUE when the device is being onlined again rather than
  * added for the first time.
  */
 static void
 l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
 {
 	/*
 	 * The L2ARC has to hold at least the payload of one log block for
 	 * them to be restored (persistent L2ARC). The payload of a log block
 	 * depends on the amount of its log entries. We always write log blocks
 	 * with 1022 entries. How many of them are committed or restored depends
 	 * on the size of the L2ARC device. Thus the maximum payload of
 	 * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
 	 * is less than that, we reduce the amount of committed and restored
 	 * log entries per block so as to enable persistence.
 	 */
 	dev->l2ad_log_entries =
 	    (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) ? 0 :
 	    MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
 	    L2ARC_LOG_BLK_MAX_ENTRIES);
 
 	/*
 	 * Read the device header, if an error is returned (or the device is
 	 * too small to carry log entries) do not rebuild L2ARC.
 	 */
 	if (l2arc_dev_hdr_read(dev) != 0 || dev->l2ad_log_entries == 0) {
 		if (!spa_writeable(dev->l2ad_spa))
 			return;
 
 		/*
 		 * In this case TRIM the whole device if l2arc_trim_ahead > 0,
 		 * otherwise create a new header. We zero out the memory holding
 		 * the header to reset dh_start_lbps. If we TRIM the whole
 		 * device the new header will be written by
 		 * vdev_trim_l2arc_thread() at the end of the TRIM to update the
 		 * trim_state in the header too. When reading the header, if
 		 * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
 		 * we opt to TRIM the whole device again.
 		 */
 		if (l2arc_trim_ahead > 0) {
 			dev->l2ad_trim_all = B_TRUE;
 		} else {
 			memset(dev->l2ad_dev_hdr, 0, dev->l2ad_dev_hdr_asize);
 			l2arc_dev_hdr_update(dev);
 		}
 		return;
 	}
 
 	/*
 	 * If we are onlining a cache device (vdev_reopen) that was
 	 * still present (l2arc_vdev_present()) and rebuild is enabled,
 	 * we should evict all ARC buffers and pointers to log blocks
 	 * and reclaim their space before restoring its contents to
 	 * L2ARC.
 	 */
 	if (reopen) {
 		if (!l2arc_rebuild_enabled)
 			return;
 
 		l2arc_evict(dev, 0, B_TRUE);
 		/* start a new log block */
 		dev->l2ad_log_ent_idx = 0;
 		dev->l2ad_log_blk_payload_asize = 0;
 		dev->l2ad_log_blk_payload_start = 0;
 	}
 
 	/*
 	 * Just mark the device as pending for a rebuild. We won't
 	 * be starting a rebuild in line here as it would block pool
 	 * import. Instead spa_load_impl will hand that off to an
 	 * async task which will call l2arc_spa_rebuild_start.
 	 */
 	dev->l2ad_rebuild = B_TRUE;
 }
 
 /*
  * Add a vdev for use by the L2ARC.  By this point the spa has already
  * validated the vdev and opened it.
  *
  * Allocates and initialises the l2arc_dev_t, decides through
  * l2arc_rebuild_dev() whether it should be rebuilt or trimmed, and only
  * then publishes it on the global device list.
  */
 void
 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 {
 	l2arc_dev_t		*dev;
 	uint64_t		hdr_asize;
 
 	ASSERT(!l2arc_vdev_present(vd));
 
 	/*
 	 * Create a new l2arc device entry.
 	 */
 	dev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	dev->l2ad_spa = spa;
 	dev->l2ad_vdev = vd;
 
 	/* leave extra size for an l2arc device header */
 	dev->l2ad_dev_hdr_asize =
 	    MAX(sizeof (*dev->l2ad_dev_hdr), 1 << vd->vdev_ashift);
 	hdr_asize = dev->l2ad_dev_hdr_asize;
 
 	/* Usable region: after the label and header, up to the min asize. */
 	dev->l2ad_start = VDEV_LABEL_START_SIZE + hdr_asize;
 	dev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	ASSERT3U(dev->l2ad_start, <, dev->l2ad_end);
 
 	/* Both the write hand and the evict hand begin at the start. */
 	dev->l2ad_hand = dev->l2ad_start;
 	dev->l2ad_evict = dev->l2ad_start;
 	dev->l2ad_first = B_TRUE;
 	dev->l2ad_writing = B_FALSE;
 	dev->l2ad_trim_all = B_FALSE;
 	list_link_init(&dev->l2ad_node);
 	dev->l2ad_dev_hdr = kmem_zalloc(hdr_asize, KM_SLEEP);
 
 	mutex_init(&dev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/*
 	 * l2ad_buflist: all ARC buffers that are still valid on the device.
 	 */
 	list_create(&dev->l2ad_buflist, sizeof (arc_buf_hdr_t),
 	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
 
 	/*
 	 * l2ad_lbptr_list: pointers to log blocks that are still present
 	 * on the device.
 	 */
 	list_create(&dev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t),
 	    offsetof(l2arc_lb_ptr_buf_t, node));
 
 	vdev_space_update(vd, 0, 0, dev->l2ad_end - dev->l2ad_hand);
 	zfs_refcount_create(&dev->l2ad_alloc);
 	zfs_refcount_create(&dev->l2ad_lb_asize);
 	zfs_refcount_create(&dev->l2ad_lb_count);
 
 	/*
 	 * Decide if dev is eligible for L2ARC rebuild or whole device
 	 * trimming. This has to happen before the device is added in the
 	 * cache device list and l2arc_dev_mtx is released. Otherwise
 	 * l2arc_feed_thread() might already start writing on the
 	 * device.
 	 */
 	l2arc_rebuild_dev(dev, B_FALSE);
 
 	/*
 	 * Add device to global list
 	 */
 	mutex_enter(&l2arc_dev_mtx);
 	list_insert_head(l2arc_dev_list, dev);
 	atomic_inc_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 }
 
 /*
  * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen()
  * in case of onlining a cache device.  The vdev must already be a known
  * L2ARC device; the decision itself is made by l2arc_rebuild_dev().
  */
 void
 l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
 {
 	l2arc_dev_t		*l2dev = l2arc_vdev_get(vd);
 
 	ASSERT3P(l2dev, !=, NULL);
 
 	/*
 	 * In contrast to l2arc_add_vdev() we do not have to worry about
 	 * l2arc_feed_thread() invalidating previous content when onlining a
 	 * cache device. The device parameters (l2ad*) are not cleared when
 	 * offlining the device and writing new buffers will not invalidate
 	 * all previous content. In worst case only buffers that have not had
 	 * their log block written to the device will be lost.
 	 * When onlining the cache device (ie offline->online without exporting
 	 * the pool in between) this happens:
 	 * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev()
 	 * 			|			|
 	 * 		vdev_is_dead() = B_FALSE	l2ad_rebuild = B_TRUE
 	 * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild
 	 * is set to B_TRUE we might write additional buffers to the device.
 	 */
 	l2arc_rebuild_dev(l2dev, reopen);
 }
 
 /*
  * Remove a vdev from the L2ARC: stop a rebuild that has begun, unlink the
  * device from the global list, flush everything it caches and release all
  * resources owned by its l2arc_dev_t.
  */
 void
 l2arc_remove_vdev(vdev_t *vd)
 {
 	/* Find the device by vdev; it must be a known L2ARC device. */
 	l2arc_dev_t *dev = l2arc_vdev_get(vd);
 
 	ASSERT3P(dev, !=, NULL);
 
 	/*
 	 * If a rebuild has begun, request cancellation and sleep on
 	 * l2arc_rebuild_thr_cv until l2ad_rebuild is cleared.
 	 */
 	mutex_enter(&l2arc_rebuild_thr_lock);
 	if (dev->l2ad_rebuild_began == B_TRUE) {
 		dev->l2ad_rebuild_cancel = B_TRUE;
 		while (dev->l2ad_rebuild == B_TRUE) {
 			cv_wait(&l2arc_rebuild_thr_cv,
 			    &l2arc_rebuild_thr_lock);
 		}
 	}
 	mutex_exit(&l2arc_rebuild_thr_lock);
 
 	/*
 	 * Unlink the device from the global list.
 	 */
 	mutex_enter(&l2arc_dev_mtx);
 	list_remove(l2arc_dev_list, dev);
 	l2arc_dev_last = NULL;		/* may have been invalidated */
 	atomic_dec_64(&l2arc_ndev);
 	mutex_exit(&l2arc_dev_mtx);
 
 	/*
 	 * L2ARC device flush: clear all buflists and ARC references, then
 	 * tear down the per-device state.
 	 */
 	l2arc_evict(dev, 0, B_TRUE);
 	list_destroy(&dev->l2ad_buflist);
 	ASSERT(list_is_empty(&dev->l2ad_lbptr_list));
 	list_destroy(&dev->l2ad_lbptr_list);
 	mutex_destroy(&dev->l2ad_mtx);
 	zfs_refcount_destroy(&dev->l2ad_alloc);
 	zfs_refcount_destroy(&dev->l2ad_lb_asize);
 	zfs_refcount_destroy(&dev->l2ad_lb_count);
 	kmem_free(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
 	vmem_free(dev, sizeof (l2arc_dev_t));
 }
 
 /*
  * Initialises the global L2ARC state: the feed-thread exit flag and device
  * counter, the device and free-on-write lists, and the mutexes/condition
  * variables used by the feed and rebuild threads.
  */
 void
 l2arc_init(void)
 {
 	/* No exit requested and no cache devices registered yet. */
 	l2arc_ndev = 0;
 	l2arc_thread_exit = 0;
 
 	/* Global lists and the mutexes that go with them. */
 	l2arc_dev_list = &L2ARC_dev_list;
 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
 	    offsetof(l2arc_dev_t, l2ad_node));
 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	l2arc_free_on_write = &L2ARC_free_on_write;
 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
 	    offsetof(l2arc_data_free_t, l2df_list_node));
 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/* Feed thread synchronisation. */
 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Rebuild thread synchronisation. */
 	mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL);
 }
 
 /*
  * Tears down the global L2ARC state created by l2arc_init(): the two
  * global lists plus the mutexes and condition variables.
  */
 void
 l2arc_fini(void)
 {
 	/* Global lists. */
 	list_destroy(l2arc_dev_list);
 	list_destroy(l2arc_free_on_write);
 
 	/* Feed thread synchronisation. */
 	cv_destroy(&l2arc_feed_thr_cv);
 	mutex_destroy(&l2arc_feed_thr_lock);
 
 	/* Rebuild thread synchronisation. */
 	cv_destroy(&l2arc_rebuild_thr_cv);
 	mutex_destroy(&l2arc_rebuild_thr_lock);
 
 	/* List mutexes. */
 	mutex_destroy(&l2arc_dev_mtx);
 	mutex_destroy(&l2arc_free_on_write_mtx);
 }
 
 /*
  * Starts the L2ARC feed thread.  Nothing is started unless
  * spa_mode_global includes SPA_MODE_WRITE.
  */
 void
 l2arc_start(void)
 {
 	if (spa_mode_global & SPA_MODE_WRITE) {
 		(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0,
 		    &p0, TS_RUN, defclsyspri);
 	}
 }
 
 /*
  * Asks the L2ARC feed thread to exit and blocks until it has acknowledged
  * by resetting l2arc_thread_exit to 0.  Does nothing unless
  * spa_mode_global includes SPA_MODE_WRITE (no thread was started then).
  */
 void
 l2arc_stop(void)
 {
 	if (spa_mode_global & SPA_MODE_WRITE) {
 		mutex_enter(&l2arc_feed_thr_lock);
 		/* kick thread out of startup */
 		cv_signal(&l2arc_feed_thr_cv);
 		l2arc_thread_exit = 1;
 		while (l2arc_thread_exit != 0) {
 			cv_wait(&l2arc_feed_thr_cv,
 			    &l2arc_feed_thr_lock);
 		}
 		mutex_exit(&l2arc_feed_thr_lock);
 	}
 }
 
 /*
  * Punches out rebuild threads for the L2ARC devices in a spa. This should
  * be called after pool import from the spa async thread, since starting
  * these threads directly from spa_import() will make them part of the
  * "zpool import" context and delay process exit (and thus pool import).
  *
  * The caller must hold spa_namespace_lock.
  */
 void
 l2arc_spa_rebuild_start(spa_t *spa)
 {
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Walk the spa's cache vdevs and start one rebuild thread for each
 	 * device that is marked for rebuild and has not been cancelled.
 	 */
 	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
 		vdev_t *cache_vd = spa->spa_l2cache.sav_vdevs[i];
 		l2arc_dev_t *dev = l2arc_vdev_get(cache_vd);
 
 		/* Don't attempt a rebuild if the vdev is UNAVAIL */
 		if (dev != NULL) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
 				dev->l2ad_rebuild_began = B_TRUE;
 				(void) thread_create(NULL, 0,
 				    l2arc_dev_rebuild_thread, dev, 0, &p0,
 				    TS_RUN, minclsyspri);
 			}
 			mutex_exit(&l2arc_rebuild_thr_lock);
 		}
 	}
 }
 
 /*
  * Main entry point for L2ARC rebuilding.
  */
 static __attribute__((noreturn)) void
 l2arc_dev_rebuild_thread(void *arg)
 {
 	l2arc_dev_t *dev = arg;
 
 	VERIFY(!dev->l2ad_rebuild_cancel);
 	VERIFY(dev->l2ad_rebuild);
 	(void) l2arc_rebuild(dev);
 	mutex_enter(&l2arc_rebuild_thr_lock);
 	dev->l2ad_rebuild_began = B_FALSE;
 	dev->l2ad_rebuild = B_FALSE;
 	mutex_exit(&l2arc_rebuild_thr_lock);
 
 	thread_exit();
 }
 
 /*
  * This function implements the actual L2ARC metadata rebuild. It:
  * starts reading the log block chain and restores each block's contents
  * to memory (reconstructing arc_buf_hdr_t's).
  *
  * Operation stops under any of the following conditions:
  *
  * 1) We reach the end of the log block chain.
  * 2) We encounter *any* error condition (cksum errors, io errors)
  *
  * It also stops when l2arc_hdr_limit_reached() reports memory pressure
  * (ENOMEM), when l2ad_rebuild_cancel is observed (ECANCELED) and when the
  * log block loop protection below triggers.
  *
  * Returns 0 or the error that ended the rebuild.  The SCL_L2ARC config
  * lock is taken and dropped internally; `lock_held' tracks whether it
  * still has to be released on the way out.
  */
 static int
 l2arc_rebuild(l2arc_dev_t *dev)
 {
 	vdev_t			*vd = dev->l2ad_vdev;
 	spa_t			*spa = vd->vdev_spa;
 	int			err = 0;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	l2arc_log_blk_phys_t	*this_lb, *next_lb;
 	zio_t			*this_io = NULL, *next_io = NULL;
 	l2arc_log_blkptr_t	lbps[2];
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 	boolean_t		lock_held;
 
 	/* Two buffers: the block being restored and the one being prefetched. */
 	this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP);
 	next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP);
 
 	/*
 	 * We prevent device removal while issuing reads to the device,
 	 * then during the rebuilding phases we drop this lock again so
 	 * that a spa_unload or device remove can be initiated - this is
 	 * safe, because the spa will signal us to stop before removing
 	 * our device and wait for us to stop.
 	 */
 	spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
 	lock_held = B_TRUE;
 
 	/*
 	 * Retrieve the persistent L2ARC device state.
 	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 	 */
 	dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start);
 	dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr +
 	    L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop),
 	    dev->l2ad_start);
 	dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST);
 
 	vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time;
 	vd->vdev_trim_state = l2dhdr->dh_trim_state;
 
 	/*
 	 * In case the zfs module parameter l2arc_rebuild_enabled is false
 	 * we do not start the rebuild process.
 	 */
 	if (!l2arc_rebuild_enabled)
 		goto out;
 
 	/* Prepare the rebuild process */
 	memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps));
 
 	/* Start the rebuild process */
 	for (;;) {
 		/* lbps[0] is the block to restore, lbps[1] its predecessor. */
 		if (!l2arc_log_blkptr_valid(dev, &lbps[0]))
 			break;
 
 		if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1],
 		    this_lb, next_lb, this_io, &next_io)) != 0)
 			goto out;
 
 		/*
 		 * Our memory pressure valve. If the system is running low
 		 * on memory, rather than swamping memory with new ARC buf
 		 * hdrs, we opt not to rebuild the L2ARC. At this point,
 		 * however, we have already set up our L2ARC dev to chain in
 		 * new metadata log blocks, so the user may choose to offline/
 		 * online the L2ARC dev at a later time (or re-import the pool)
 		 * to reconstruct it (when there's less memory pressure).
 		 */
 		if (l2arc_hdr_limit_reached()) {
 			ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
 			cmn_err(CE_NOTE, "System running low on memory, "
 			    "aborting L2ARC rebuild.");
 			err = SET_ERROR(ENOMEM);
 			goto out;
 		}
 
 		/* Restore without the config lock so removal can proceed. */
 		spa_config_exit(spa, SCL_L2ARC, vd);
 		lock_held = B_FALSE;
 
 		/*
 		 * Now that we know that the next_lb checks out alright, we
 		 * can start reconstruction from this log block.
 		 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 		 */
 		uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
 		l2arc_log_blk_restore(dev, this_lb, asize);
 
 		/*
 		 * log block restored, include its pointer in the list of
 		 * pointers to log blocks present in the L2ARC device.
 		 */
 		lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
 		lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
 		    KM_SLEEP);
 		memcpy(lb_ptr_buf->lb_ptr, &lbps[0],
 		    sizeof (l2arc_log_blkptr_t));
 		mutex_enter(&dev->l2ad_mtx);
 		list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
 		ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
 		ARCSTAT_BUMP(arcstat_l2_log_blk_count);
 		zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 		zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 		mutex_exit(&dev->l2ad_mtx);
 		vdev_space_update(vd, asize, 0, 0);
 
 		/*
 		 * Protection against loops of log blocks:
 		 *
 		 *				       l2ad_hand  l2ad_evict
 		 *                                         V	      V
 		 * l2ad_start |=======================================| l2ad_end
 		 *             -----|||----|||---|||----|||
 		 *                  (3)    (2)   (1)    (0)
 		 *             ---|||---|||----|||---|||
 		 *		  (7)   (6)    (5)   (4)
 		 *
 		 * In this situation the pointer of log block (4) passes
 		 * l2arc_log_blkptr_valid() but the log block should not be
 		 * restored as it is overwritten by the payload of log block
 		 * (0). Only log blocks (0)-(3) should be restored. We check
 		 * whether l2ad_evict lies in between the payload starting
 		 * offset of the next log block (lbps[1].lbp_payload_start)
 		 * and the payload starting offset of the present log block
 		 * (lbps[0].lbp_payload_start). If true and this isn't the
 		 * first pass, we are looping from the beginning and we should
 		 * stop.
 		 */
 		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
 		    lbps[0].lbp_payload_start, dev->l2ad_evict) &&
 		    !dev->l2ad_first)
 			goto out;
 
 		kpreempt(KPREEMPT_SYNC);
 		/*
 		 * Re-take the config lock, polling l2ad_rebuild_cancel between
 		 * attempts so a pending device removal is not blocked by us.
 		 */
 		for (;;) {
 			mutex_enter(&l2arc_rebuild_thr_lock);
 			if (dev->l2ad_rebuild_cancel) {
 				/*
 				 * NOTE(review): l2ad_rebuild is cleared and the
 				 * cv signalled here, yet `dev' is still read
 				 * after the `out' label and by the calling
 				 * thread function.  l2arc_remove_vdev() frees
 				 * the device once its wait on l2ad_rebuild
 				 * ends, so this looks like a window for a
 				 * use-after-free - confirm and consider moving
 				 * the wakeup to the very end of the rebuild
 				 * thread.
 				 */
 				dev->l2ad_rebuild = B_FALSE;
 				cv_signal(&l2arc_rebuild_thr_cv);
 				mutex_exit(&l2arc_rebuild_thr_lock);
 				err = SET_ERROR(ECANCELED);
 				goto out;
 			}
 			mutex_exit(&l2arc_rebuild_thr_lock);
 			if (spa_config_tryenter(spa, SCL_L2ARC, vd,
 			    RW_READER)) {
 				lock_held = B_TRUE;
 				break;
 			}
 			/*
 			 * L2ARC config lock held by somebody in writer,
 			 * possibly due to them trying to remove us. They'll
 			 * likely to want us to shut down, so after a little
 			 * delay, we check l2ad_rebuild_cancel and retry
 			 * the lock again.
 			 */
 			delay(1);
 		}
 
 		/*
 		 * Continue with the next log block.
 		 */
 		lbps[0] = lbps[1];
 		lbps[1] = this_lb->lb_prev_lbp;
 		PTR_SWAP(this_lb, next_lb);
 		this_io = next_io;
 		next_io = NULL;
 	}
 
 	/* Left the loop via `break': a prefetch may still be outstanding. */
 	if (this_io != NULL)
 		l2arc_log_blk_fetch_abort(this_io);
 out:
 	if (next_io != NULL)
 		l2arc_log_blk_fetch_abort(next_io);
 	vmem_free(this_lb, sizeof (*this_lb));
 	vmem_free(next_lb, sizeof (*next_lb));
 
 	/* Report the outcome of the rebuild. */
 	if (!l2arc_rebuild_enabled) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "disabled");
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_success);
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "successful, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	} else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) {
 		/*
 		 * No error but also nothing restored, meaning the lbps array
 		 * in the device header points to invalid/non-present log
 		 * blocks. Reset the header.
 		 */
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "no valid log blocks");
 		memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize);
 		l2arc_dev_hdr_update(dev);
 	} else if (err == ECANCELED) {
 		/*
 		 * In case the rebuild was canceled do not log to spa history
 		 * log as the pool may be in the process of being removed.
 		 */
 		zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	} else if (err != 0) {
 		spa_history_log_internal(spa, "L2ARC rebuild", NULL,
 		    "aborted, restored %llu blocks",
 		    (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count));
 	}
 
 	if (lock_held)
 		spa_config_exit(spa, SCL_L2ARC, vd);
 
 	return (err);
 }
 
 /*
  * Reads the device header of the provided L2ARC device into
  * dev->l2ad_dev_hdr, byteswapping it when the magic is found in the
  * opposite byte order, and validates it against the device and pool.
  * Returns 0 on success, the zio error when the read fails, or ENOTSUP when
  * the header is missing, foreign, from another version or otherwise
  * unusable.
  */
 static int
 l2arc_dev_hdr_read(l2arc_dev_t *dev)
 {
 	vdev_t			*vd = dev->l2ad_vdev;
 	l2arc_dev_hdr_phys_t	*hdr = dev->l2ad_dev_hdr;
 	const uint64_t		hdr_asize = dev->l2ad_dev_hdr_asize;
 	const uint64_t		pool_guid = spa_guid(vd->vdev_spa);
 	abd_t			*hdr_abd = abd_get_from_buf(hdr, hdr_asize);
 	zio_t			*rzio;
 	int			err;
 
 	/* The header sits right after the front vdev labels. */
 	rzio = zio_read_phys(NULL, vd, VDEV_LABEL_START_SIZE, hdr_asize,
 	    hdr_abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY |
 	    ZIO_FLAG_SPECULATIVE, B_FALSE);
 	err = zio_wait(rzio);
 
 	abd_free(hdr_abd);
 
 	if (err != 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors);
 		zfs_dbgmsg("L2ARC IO error (%d) while reading device header, "
 		    "vdev guid: %llu", err,
 		    (u_longlong_t)vd->vdev_guid);
 		return (err);
 	}
 
 	if (hdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
 		byteswap_uint64_array(hdr, sizeof (*hdr));
 
 	/*
 	 * The header is usable only if it carries our magic and version,
 	 * belongs to this pool and vdev, matches the device geometry, has
 	 * an evict hand inside the device and, when l2arc_trim_ahead > 0,
 	 * records a completed TRIM.
 	 */
 	boolean_t hdr_usable =
 	    hdr->dh_magic == L2ARC_DEV_HDR_MAGIC &&
 	    hdr->dh_spa_guid == pool_guid &&
 	    hdr->dh_vdev_guid == vd->vdev_guid &&
 	    hdr->dh_version == L2ARC_PERSISTENT_VERSION &&
 	    hdr->dh_log_entries == dev->l2ad_log_entries &&
 	    hdr->dh_end == dev->l2ad_end &&
 	    l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end,
 	    hdr->dh_evict) &&
 	    (hdr->dh_trim_state == VDEV_TRIM_COMPLETE ||
 	    !(l2arc_trim_ahead > 0));
 
 	if (!hdr_usable) {
 		/*
 		 * Attempt to rebuild a device containing no actual dev hdr
 		 * or containing a header from some other pool or from another
 		 * version of persistent L2ARC.
 		 */
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	return (0);
 }
 
 /*
  * Reads L2ARC log blocks from storage and validates their contents.
  *
  * This function implements a simple fetcher to make sure that while
  * we're processing one buffer the L2ARC is already fetching the next
  * one in the chain.
  *
  * The arguments this_lbp and next_lbp point to the current and next log
  * block pointer in the block chain. Similarly, this_lb and next_lb hold the
  * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
  *
  * The `this_io' and `next_io' arguments are used for block fetching.
  * When issuing the first blk IO during rebuild, you should pass NULL for
  * `this_io'. This function will then issue a sync IO to read the block and
  * also issue an async IO to fetch the next block in the block chain. The
  * fetched IO is returned in `next_io'. On subsequent calls to this
  * function, pass the value returned in `next_io' from the previous call
  * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
  * Prior to the call, you should initialize your `next_io' pointer to be
  * NULL. If no fetch IO was issued, the pointer is left set at NULL.
  *
  * On success, this function returns 0, otherwise it returns an appropriate
  * error code. `this_io' is always waited for before returning. On error
  * any fetch IO stored in `*next_io' is aborted and the pointer is reset to
  * NULL, so after a failure the caller has no fetch IO left to clean up.
  */
 static int
 l2arc_log_blk_read(l2arc_dev_t *dev,
     const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
     l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
     zio_t *this_io, zio_t **next_io)
 {
 	int		err = 0;
 	zio_cksum_t	cksum;
 	abd_t		*abd = NULL;
 	uint64_t	asize;
 
 	ASSERT(this_lbp != NULL && next_lbp != NULL);
 	ASSERT(this_lb != NULL && next_lb != NULL);
 	ASSERT(next_io != NULL && *next_io == NULL);
 	ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
 
 	/*
 	 * Check to see if we have issued the IO for this log block in a
 	 * previous run. If not, this is the first call, so issue it now.
 	 */
 	if (this_io == NULL) {
 		this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
 		    this_lb);
 	}
 
 	/*
 	 * Peek to see if we can start issuing the next IO immediately.
 	 */
 	if (l2arc_log_blkptr_valid(dev, next_lbp)) {
 		/*
 		 * Start issuing IO for the next log block early - this
 		 * should help keep the L2ARC device busy while we
 		 * decompress and restore this log block.
 		 */
 		*next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
 		    next_lb);
 	}
 
 	/* Wait for the IO to read this log block to complete */
 	if ((err = zio_wait(this_io)) != 0) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
 		zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
 		    "offset: %llu, vdev guid: %llu", err,
 		    (u_longlong_t)this_lbp->lbp_daddr,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 		goto cleanup;
 	}
 
 	/*
 	 * Make sure the buffer checks out.
 	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
 	 */
 	asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop);
 	/* The checksum covers the block as stored, before decompression. */
 	fletcher_4_native(this_lb, asize, NULL, &cksum);
 	if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
 		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors);
 		zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, "
 		    "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu",
 		    (u_longlong_t)this_lbp->lbp_daddr,
 		    (u_longlong_t)dev->l2ad_vdev->vdev_guid,
 		    (u_longlong_t)dev->l2ad_hand,
 		    (u_longlong_t)dev->l2ad_evict);
 		err = SET_ERROR(ECKSUM);
 		goto cleanup;
 	}
 
 	/* Now we can take our time decoding this buffer */
 	switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) {
 	case ZIO_COMPRESS_OFF:
 		break;
 	case ZIO_COMPRESS_LZ4:
 		/*
 		 * Copy the compressed bytes into a scratch abd so the block
 		 * can be decompressed back into this_lb.
 		 */
 		abd = abd_alloc_for_io(asize, B_TRUE);
 		abd_copy_from_buf_off(abd, this_lb, 0, asize);
 		if ((err = zio_decompress_data(
 		    L2BLK_GET_COMPRESS((this_lbp)->lbp_prop),
 		    abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) {
 			err = SET_ERROR(EINVAL);
 			goto cleanup;
 		}
 		break;
 	default:
 		/* Any other compression setting is rejected. */
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
 	}
 	/* Accept a block written in the opposite byte order, then verify. */
 	if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
 		byteswap_uint64_array(this_lb, sizeof (*this_lb));
 	if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
 		err = SET_ERROR(EINVAL);
 		goto cleanup;
 	}
 cleanup:
 	/* Abort an in-flight fetch I/O in case of error */
 	if (err != 0 && *next_io != NULL) {
 		l2arc_log_blk_fetch_abort(*next_io);
 		*next_io = NULL;
 	}
 	/* Scratch buffer from the LZ4 path, if one was allocated. */
 	if (abd != NULL)
 		abd_free(abd);
 	return (err);
 }
 
 /*
  * Restores the payload of a log block to ARC. This creates empty ARC hdr
  * entries which only contain an l2arc hdr, essentially restoring the
  * buffers to their L2ARC evicted state. Rebuild statistics are updated
  * with the totals of the restored entries; `lb_asize' is the aligned size
  * of the log block itself.
  */
 static void
 l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb,
     uint64_t lb_asize)
 {
 	const uint64_t	nentries = dev->l2ad_log_entries;
 	uint64_t	lsize_total = 0;
 	uint64_t	asize_total = 0;
 
 	/*
 	 * Usually arc_adapt() is called only for data, not headers, but
 	 * since we may allocate significant amount of memory here, let ARC
 	 * grow its arc_c.
 	 */
 	arc_adapt(nentries * HDR_L2ONLY_SIZE);
 
 	/*
 	 * Restore goes in the reverse temporal direction to preserve
 	 * correct temporal ordering of buffers in the l2ad_buflist.
 	 * l2arc_hdr_restore also does a list_insert_tail instead of
 	 * list_insert_head on the l2ad_buflist:
 	 *
 	 *		LIST	l2ad_buflist		LIST
 	 *		HEAD  <------ (time) ------	TAIL
 	 * direction	+-----+-----+-----+-----+-----+    direction
 	 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
 	 * fill		+-----+-----+-----+-----+-----+
 	 *		^				^
 	 *		|				|
 	 *		|				|
 	 *	l2arc_feed_thread		l2arc_rebuild
 	 *	will place new bufs here	restores bufs here
 	 *
 	 * During l2arc_rebuild() the device is not used by
 	 * l2arc_feed_thread() as dev->l2ad_rebuild is set to true.
 	 */
 	for (int i = nentries - 1; i >= 0; i--) {
 		const l2arc_log_ent_phys_t *le = &lb->lb_entries[i];
 
 		lsize_total += L2BLK_GET_LSIZE(le->le_prop);
 		asize_total += vdev_psize_to_asize(dev->l2ad_vdev,
 		    L2BLK_GET_PSIZE(le->le_prop));
 		l2arc_hdr_restore(le, dev);
 	}
 
 	/*
 	 * Record rebuild stats:
 	 *	lsize_total	Logical size of restored buffers in the L2ARC
 	 *	asize_total	Aligned size of restored buffers in the L2ARC
 	 */
 	ARCSTAT_INCR(arcstat_l2_rebuild_size, lsize_total);
 	ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize_total);
 	ARCSTAT_INCR(arcstat_l2_rebuild_bufs, nentries);
 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize);
 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize_total / lb_asize);
 	ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
 }
 
 /*
  * Restores a single ARC buf hdr from a log entry. The ARC buffer is put
  * into a state indicating that it has been evicted to L2ARC.
  *
  * A new L2-only header is built from `le' and inserted into the buffer
  * hash table. If an entry for the same buffer is already hashed, the new
  * header is destroyed again and, when the existing header has no L2ARC
  * metadata yet, that metadata is attached to the existing header instead.
  */
 static void
 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
 {
 	arc_buf_hdr_t		*hdr, *exists;
 	kmutex_t		*hash_lock;
 	arc_buf_contents_t	type = L2BLK_GET_TYPE((le)->le_prop);
 	uint64_t		asize;
 
 	/*
 	 * Do all the allocation before grabbing any locks, this lets us
 	 * sleep if memory is full and we don't have to deal with failed
 	 * allocations.
 	 */
 	hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
 	    dev, le->le_dva, le->le_daddr,
 	    L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
 	    L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
 	    L2BLK_GET_PROTECTED((le)->le_prop),
 	    L2BLK_GET_PREFETCH((le)->le_prop),
 	    L2BLK_GET_STATE((le)->le_prop));
 	/* Space the entry occupies on the cache vdev, used for accounting. */
 	asize = vdev_psize_to_asize(dev->l2ad_vdev,
 	    L2BLK_GET_PSIZE((le)->le_prop));
 
 	/*
 	 * vdev_space_update() has to be called before arc_hdr_destroy() to
 	 * avoid underflow since the latter also calls vdev_space_update().
 	 */
 	l2arc_hdr_arcstats_increment(hdr);
 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/* Link the new header at the tail of the device's buffer list. */
 	mutex_enter(&dev->l2ad_mtx);
 	list_insert_tail(&dev->l2ad_buflist, hdr);
 	(void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
 	mutex_exit(&dev->l2ad_mtx);
 
 	/*
 	 * buf_hash_insert() hands back the hash lock through hash_lock; it
 	 * is released at the end of this function on both paths.
 	 */
 	exists = buf_hash_insert(hdr, &hash_lock);
 	if (exists) {
 		/* Buffer was already cached, no need to restore it. */
 		arc_hdr_destroy(hdr);
 		/*
 		 * If the buffer is already cached, check whether it has
 		 * L2ARC metadata. If not, enter them and update the flag.
 		 * This is important in case of onlining a cache device, since
 		 * we previously evicted all L2ARC metadata from ARC.
 		 */
 		if (!HDR_HAS_L2HDR(exists)) {
 			arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
 			exists->b_l2hdr.b_dev = dev;
 			exists->b_l2hdr.b_daddr = le->le_daddr;
 			exists->b_l2hdr.b_arcs_state =
 			    L2BLK_GET_STATE((le)->le_prop);
 			mutex_enter(&dev->l2ad_mtx);
 			list_insert_tail(&dev->l2ad_buflist, exists);
 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
 			    arc_hdr_size(exists), exists);
 			mutex_exit(&dev->l2ad_mtx);
 			l2arc_hdr_arcstats_increment(exists);
 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 		}
 		ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
 	}
 
 	mutex_exit(hash_lock);
 }
 
 /*
  * Starts an asynchronous read IO to read a log block. This is used in log
  * block reconstruction to start reading the next block before we are done
  * decoding and reconstructing the current block, to keep the l2arc device
  * nice and hot with read IO to process.
  * The log block is read into `lb' through an abd wrapped around it and
  * stored in a newly allocated read callback. The returned zio is the root
  * zio of the read; wait on it to consume the block, or dispose of it with
  * l2arc_log_blk_fetch_abort if it is no longer needed.
  */
 static zio_t *
 l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
     l2arc_log_blk_phys_t *lb)
 {
 	l2arc_read_callback_t	*rcb;
 	zio_t			*root, *rzio;
 	uint32_t		lb_asize;
 
 	/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 	lb_asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
 	ASSERT(lb_asize <= sizeof (l2arc_log_blk_phys_t));
 
 	rcb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP);
 	rcb->l2rcb_abd = abd_get_from_buf(lb, lb_asize);
 
 	/* Root zio whose done callback receives the read callback. */
 	root = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, rcb,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
 
 	/* Physical read of the log block, issued as a child of the root. */
 	rzio = zio_read_phys(root, vd, lbp->lbp_daddr, lb_asize,
 	    rcb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE);
 	(void) zio_nowait(rzio);
 
 	return (root);
 }
 
 /*
  * Aborts a zio returned from l2arc_log_blk_fetch and frees the data
  * buffers allocated for it.
  *
  * The only action taken here is to wait for the zio to finish; the return
  * value of zio_wait() is deliberately discarded.
  * NOTE(review): nothing is freed in this function itself -- presumably the
  * zio's done callback (l2arc_blk_fetch_done, installed by
  * l2arc_log_blk_fetch) performs the cleanup; confirm.
  */
 static void
 l2arc_log_blk_fetch_abort(zio_t *zio)
 {
 	(void) zio_wait(zio);
 }
 
 /*
  * Refreshes the in-memory device header of an L2ARC device from the
  * current device state and synchronously writes it to the device at
  * offset VDEV_LABEL_START_SIZE.
  *
  * A failed write is only reported through zfs_dbgmsg(); nothing is
  * returned to the caller. The function VERIFYs that the spa config lock
  * is held (spa_config_held() with SCL_STATE_ALL, RW_READER).
  */
 void
 l2arc_dev_hdr_update(l2arc_dev_t *dev)
 {
 	l2arc_dev_hdr_phys_t	*hdr_phys = dev->l2ad_dev_hdr;
 	const uint64_t		hdr_asize = dev->l2ad_dev_hdr_asize;
 	abd_t			*hdr_abd;
 	int			error;
 
 	VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER));
 
 	/* Format identification and the owning pool / vdev. */
 	hdr_phys->dh_magic = L2ARC_DEV_HDR_MAGIC;
 	hdr_phys->dh_version = L2ARC_PERSISTENT_VERSION;
 	hdr_phys->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
 	hdr_phys->dh_vdev_guid = dev->l2ad_vdev->vdev_guid;
 
 	/* Geometry and log block accounting of the device. */
 	hdr_phys->dh_log_entries = dev->l2ad_log_entries;
 	hdr_phys->dh_evict = dev->l2ad_evict;
 	hdr_phys->dh_start = dev->l2ad_start;
 	hdr_phys->dh_end = dev->l2ad_end;
 	hdr_phys->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize);
 	hdr_phys->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count);
 
 	/* TRIM state is copied from the vdev. */
 	hdr_phys->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time;
 	hdr_phys->dh_trim_state = dev->l2ad_vdev->vdev_trim_state;
 
 	/* The only flag ever set here mirrors dev->l2ad_first. */
 	hdr_phys->dh_flags = dev->l2ad_first ? L2ARC_DEV_HDR_EVICT_FIRST : 0;
 
 	hdr_abd = abd_get_from_buf(hdr_phys, hdr_asize);
 	error = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev,
 	    VDEV_LABEL_START_SIZE, hdr_asize, hdr_abd, ZIO_CHECKSUM_LABEL,
 	    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE));
 	abd_free(hdr_abd);
 
 	if (error == 0)
 		return;
 
 	zfs_dbgmsg("L2ARC IO error (%d) while writing device header, "
 	    "vdev guid: %llu", error,
 	    (u_longlong_t)dev->l2ad_vdev->vdev_guid);
 }
 
 /*
  * Commits a log block to the L2ARC device. This routine is invoked from
  * l2arc_write_buffers when the log block fills up.
  * This function allocates some memory to temporarily hold the serialized
  * buffer to be written. This is then released in l2arc_write_done.
  *
  * The write is issued without waiting, as a child of `pio'. Returns the
  * allocated size of the log block on the device, which is also the amount
  * by which dev->l2ad_hand is advanced.
  */
 static uint64_t
 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb)
 {
 	l2arc_log_blk_phys_t	*lb = &dev->l2ad_log_blk;
 	l2arc_dev_hdr_phys_t	*l2dhdr = dev->l2ad_dev_hdr;
 	uint64_t		psize, asize;
 	zio_t			*wzio;
 	l2arc_lb_abd_buf_t	*abd_buf;
 	uint8_t			*tmpbuf = NULL;
 	l2arc_lb_ptr_buf_t	*lb_ptr_buf;
 
 	/* A log block is only committed once every entry slot is filled. */
 	VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries);
 
 	abd_buf = zio_buf_alloc(sizeof (*abd_buf));
 	abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb));
 	lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
 	lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP);
 
 	/* link the buffer into the block chain */
 	lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1];
 	lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
 
 	/*
 	 * l2arc_log_blk_commit() may be called multiple times during a single
 	 * l2arc_write_buffers() call. Save the allocated abd buffers in a list
 	 * so we can free them in l2arc_write_done() later on.
 	 */
 	list_insert_tail(&cb->l2wcb_abd_list, abd_buf);
 
 	/* try to compress the buffer */
 	psize = zio_compress_data(ZIO_COMPRESS_LZ4,
 	    abd_buf->abd, (void **) &tmpbuf, sizeof (*lb), 0);
 
 	/* a log block is never entirely zero */
 	ASSERT(psize != 0);
 	asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
 	ASSERT(asize <= sizeof (*lb));
 
 	/*
 	 * Update the start log block pointer in the device header to point
 	 * to the log block we're about to write. The pointer that was newest
 	 * so far is moved to slot 1 before slot 0 is overwritten.
 	 */
 	l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0];
 	l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
 	l2dhdr->dh_start_lbps[0].lbp_payload_asize =
 	    dev->l2ad_log_blk_payload_asize;
 	l2dhdr->dh_start_lbps[0].lbp_payload_start =
 	    dev->l2ad_log_blk_payload_start;
 	L2BLK_SET_LSIZE(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb));
 	L2BLK_SET_PSIZE(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize);
 	L2BLK_SET_CHECKSUM(
 	    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 	    ZIO_CHECKSUM_FLETCHER_4);
 	if (asize < sizeof (*lb)) {
 		/* compression succeeded */
 		/* zero the padding between psize and the aligned asize */
 		memset(tmpbuf + psize, 0, asize - psize);
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_LZ4);
 	} else {
 		/* compression failed */
 		/* write the log block as is, marked as uncompressed */
 		memcpy(tmpbuf, lb, sizeof (*lb));
 		L2BLK_SET_COMPRESS(
 		    (&l2dhdr->dh_start_lbps[0])->lbp_prop,
 		    ZIO_COMPRESS_OFF);
 	}
 
 	/* checksum what we're about to write */
 	fletcher_4_native(tmpbuf, asize, NULL,
 	    &l2dhdr->dh_start_lbps[0].lbp_cksum);
 
 	/* the ABD wrapping the in-memory log block is replaced below */
 	abd_free(abd_buf->abd);
 
 	/* perform the write itself */
 	abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb));
 	abd_take_ownership_of_buf(abd_buf->abd, B_TRUE);
 	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
 	    asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL,
 	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
 	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
 	(void) zio_nowait(wzio);
 
 	dev->l2ad_hand += asize;
 	/*
 	 * Include the committed log block's pointer in the list of pointers
 	 * to log blocks present in the L2ARC device.
 	 */
 	memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0],
 	    sizeof (l2arc_log_blkptr_t));
 	mutex_enter(&dev->l2ad_mtx);
 	list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf);
 	ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
 	ARCSTAT_BUMP(arcstat_l2_log_blk_count);
 	zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
 	zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
 	mutex_exit(&dev->l2ad_mtx);
 	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
 	/* bump the kstats */
 	ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
 	ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
 	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize);
 	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
 	    dev->l2ad_log_blk_payload_asize / asize);
 
 	/* start a new log block */
 	dev->l2ad_log_ent_idx = 0;
 	dev->l2ad_log_blk_payload_asize = 0;
 	dev->l2ad_log_blk_payload_start = 0;
 
 	return (asize);
 }
 
 /*
  * Decides whether the log block described by `lbp' can be read from the
  * given L2ARC device.
  *
  * The block is accepted only if all of the following hold:
  * - the span from the start of its payload up to the last byte of the log
  *   block itself lies between l2ad_start and l2ad_end
  * - its aligned size is non-zero and not larger than
  *   sizeof (l2arc_log_blk_phys_t)
  * - unless dev->l2ad_first is set, neither the log block nor any part of
  *   its payload was evicted by l2arc_evict():
  *
  *		l2ad_hand          l2ad_evict
  *		|			 |	lbp_daddr
  *		|     start		 |	|  end
  *		|     |			 |	|  |
  *		V     V		         V	V  V
  *   l2ad_start ============================================ l2ad_end
  *                    --------------------------||||
  *				^		 ^
  *				|		log block
  *				payload
  */
 boolean_t
 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
 {
 	/* L2BLK_GET_PSIZE returns aligned size for log blocks */
 	const uint64_t lb_asize = L2BLK_GET_PSIZE((lbp)->lbp_prop);
 	const uint64_t first = lbp->lbp_payload_start;
 	const uint64_t last = lbp->lbp_daddr + lb_asize - 1;
 	boolean_t overlaps;
 
 	/* Payload and log block must lie within the device bounds. */
 	if (first < dev->l2ad_start || last > dev->l2ad_end)
 		return (B_FALSE);
 
 	/* The log block must have a valid size. */
 	if (lb_asize == 0 || lb_asize > sizeof (l2arc_log_blk_phys_t))
 		return (B_FALSE);
 
 	/*
 	 * Either boundary of the [l2ad_hand, l2ad_evict] region falling
 	 * inside [first, last], or either end of [first, last] falling
 	 * inside that region, counts as evicted.
 	 */
 	overlaps =
 	    l2arc_range_check_overlap(first, last, dev->l2ad_hand) ||
 	    l2arc_range_check_overlap(first, last, dev->l2ad_evict) ||
 	    l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, first) ||
 	    l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, last);
 
 	return (!overlaps || dev->l2ad_first);
 }
 
 /*
  * Appends a log entry describing ARC buffer header `hdr' to the log block
  * currently being assembled for the device. `hdr' must already carry an
  * L2ARC header (this is asserted).
  *
  * If the device has no log entries configured (l2ad_log_entries == 0)
  * nothing is recorded and B_FALSE is returned. Otherwise the return value
  * is B_TRUE when this insertion filled the log block, meaning it has to be
  * committed to L2ARC, and B_FALSE while there is still room in it.
  */
 static boolean_t
 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
 {
 	l2arc_log_blk_phys_t	*blk = &dev->l2ad_log_blk;
 	l2arc_log_ent_phys_t	*ent;
 	int			slot;
 
 	if (dev->l2ad_log_entries == 0)
 		return (B_FALSE);
 
 	/* Claim the next free slot of the log block. */
 	slot = dev->l2ad_log_ent_idx++;
 
 	ASSERT3S(slot, <, dev->l2ad_log_entries);
 	ASSERT(HDR_HAS_L2HDR(hdr));
 
 	ent = &blk->lb_entries[slot];
 	memset(ent, 0, sizeof (*ent));
 
 	ent->le_dva = hdr->b_dva;
 	ent->le_birth = hdr->b_birth;
 	ent->le_daddr = hdr->b_l2hdr.b_daddr;
 
 	/* The first entry marks where this log block's payload begins. */
 	if (slot == 0)
 		dev->l2ad_log_blk_payload_start = ent->le_daddr;
 
 	L2BLK_SET_LSIZE(ent->le_prop, HDR_GET_LSIZE(hdr));
 	L2BLK_SET_PSIZE(ent->le_prop, HDR_GET_PSIZE(hdr));
 	L2BLK_SET_COMPRESS(ent->le_prop, HDR_GET_COMPRESS(hdr));
 	ent->le_complevel = hdr->b_complevel;
 	L2BLK_SET_TYPE(ent->le_prop, hdr->b_type);
 	L2BLK_SET_PROTECTED(ent->le_prop, !!(HDR_PROTECTED(hdr)));
 	L2BLK_SET_PREFETCH(ent->le_prop, !!(HDR_PREFETCH(hdr)));
 	L2BLK_SET_STATE(ent->le_prop, hdr->b_l2hdr.b_arcs_state);
 
 	dev->l2ad_log_blk_payload_asize +=
 	    vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr));
 
 	return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries);
 }
 
 /*
  * Tells whether an L2ARC device address lies inside a time-sequential
  * range. Because the L2ARC is a rotary buffer, a plain range comparison is
  * not enough: the range may wrap around the end of the device. Arguments:
  *	bottom -- Lower end of the range to check (written to earlier).
  *	top    -- Upper end of the range to check (written to later).
  *	check  -- The address for which we want to determine if it sits in
  *		  between the top and bottom.
  *
  * Both ends of the range are inclusive. Three cases are distinguished:
  *
  *	bottom < top : Sequentially ordered case:
  *	  <check>--------+-------------------+
  *	                 |  (overlap here?)  |
  *	 L2ARC dev       V                   V
  *	 |---------------<bottom>============<top>--------------|
  *
  *	bottom > top: Looped-around case:
  *	                      <check>--------+------------------+
  *	                                     |  (overlap here?) |
  *	 L2ARC dev                           V                  V
  *	 |===============<top>---------------<bottom>===========|
  *	 ^               ^
  *	 |  (or here?)   |
  *	 +---------------+---------<check>
  *
  *	top == bottom : Just a single address comparison.
  */
 boolean_t
 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
 {
 	/* Degenerate range: a single address. */
 	if (bottom == top)
 		return (check == top);
 
 	/* Sequentially ordered range. */
 	if (bottom < top)
 		return (check >= bottom && check <= top);
 
 	/* Looped-around range: [bottom, device end] plus [device start, top]. */
 	return (check >= bottom || check <= top);
 }
 
 /* ARC functions exported from this file via EXPORT_SYMBOL(). */
 EXPORT_SYMBOL(arc_buf_size);
 EXPORT_SYMBOL(arc_write);
 EXPORT_SYMBOL(arc_read);
 EXPORT_SYMBOL(arc_buf_info);
 EXPORT_SYMBOL(arc_getbuf_func);
 EXPORT_SYMBOL(arc_add_prune_callback);
 EXPORT_SYMBOL(arc_remove_prune_callback);
 
 /*
  * ARC and L2ARC tunables registered as module parameters. The last two
  * arguments of every entry are the access mode (ZMOD_RW / ZMOD_RD) and the
  * description string. The ZFS_MODULE_PARAM_CALL form names a setter and a
  * getter function where the plain form names a type.
  * NOTE(review): the first two arguments look like a parameter scope and
  * the prefix of the backing variable's name -- confirm against the macro
  * definitions.
  */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min,
 	spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max,
 	spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW,
 	"Balance between metadata and data on ghost hits.");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "Seconds before growing ARC size");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW,
 	"Percent of pagecache to reclaim ARC to");
 
 /* The only parameter in this list registered read-only (ZMOD_RD). */
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD,
 	"Target average block size");
 
 /*
  * NOTE(review): the description says "Disable" while the parameter is
  * named compressed_arc_enabled -- confirm which polarity is intended.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW,
 	"Disable compressed ARC buffers");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "Min life of prefetch block in ms");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms,
     param_set_arc_int, param_get_uint, ZMOD_RW,
 	"Min life of prescient prefetched block in ms");
 
 /* L2ARC tunables. */
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW,
 	"Max write bytes per interval");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW,
 	"Extra write bytes during device warmup");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW,
 	"Number of max device writes to precache");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW,
 	"Compressed l2arc_headroom multiplier");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW,
 	"TRIM ahead L2ARC write size multiplier");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW,
 	"Seconds between L2ARC writing");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW,
 	"Min feed interval in milliseconds");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW,
 	"Skip caching prefetched buffers");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW,
 	"Turbo L2ARC warmup");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW,
 	"No reads during writes");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW,
 	"Percent of ARC size allowed for L2ARC-only headers");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW,
 	"Rebuild the L2ARC when importing a pool");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW,
 	"Min size in bytes to write rebuild log blocks in L2ARC");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW,
 	"Cache only MFU data from ARC into L2ARC");
 
 ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW,
 	"Exclude dbufs on special vdevs from being cached to L2ARC if set.");
 
 /*
  * NOTE(review): the description says "in bytes" while the parameter is
  * named lotsfree_percent -- confirm the unit.
  */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int,
 	param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64,
 	spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes");
 
 /*
  * NOTE(review): the description says "Minimum" while the parameter is
  * named dnode_limit -- confirm the wording.
  */
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64,
 	spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC");
 
 ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent,
     param_set_arc_int, param_get_uint, ZMOD_RW,
 	"Percent of ARC meta buffers for dnodes");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW,
 	"Percentage of excess dnodes to try to unpin");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW,
 	"When full, ARC allocation waits for eviction of this % of alloc size");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW,
 	"The number of headers to evict per sublist before moving to the next");
 
 ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW,
 	"Number of arc_prune threads");
diff --git a/sys/contrib/openzfs/module/zfs/dataset_kstats.c b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
index 2ac058fd2c93..5abee12434e9 100644
--- a/sys/contrib/openzfs/module/zfs/dataset_kstats.c
+++ b/sys/contrib/openzfs/module/zfs/dataset_kstats.c
@@ -1,255 +1,258 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2018 by Delphix. All rights reserved.
  * Copyright (c) 2018 Datto Inc.
  */
 
 #include <sys/dataset_kstats.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/spa.h>
 
 /*
  * Template for the named kstat values of a dataset: it holds the name and
  * data type of every entry and no values. dataset_kstats_create() copies
  * it into a newly allocated buffer for each dataset, and its size is used
  * to derive the number of named entries.
  */
 static dataset_kstat_values_t empty_dataset_kstats = {
 	{ "dataset_name",	KSTAT_DATA_STRING },
 	{ "writes",	KSTAT_DATA_UINT64 },
 	{ "nwritten",	KSTAT_DATA_UINT64 },
 	{ "reads",	KSTAT_DATA_UINT64 },
 	{ "nread",	KSTAT_DATA_UINT64 },
 	{ "nunlinks",	KSTAT_DATA_UINT64 },
 	{ "nunlinked",	KSTAT_DATA_UINT64 },
 	/*
 	 * Nested group of ZIL counters.
 	 * NOTE(review): presumably this initializes the dkv_zil_stats member
 	 * that dataset_kstats_update() refreshes -- confirm against the
 	 * struct definition.
 	 */
 	{
 	{ "zil_commit_count",			KSTAT_DATA_UINT64 },
 	{ "zil_commit_writer_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_count",			KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_alloc",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_alloc",	KSTAT_DATA_UINT64 }
 	}
 };
 
 /*
  * kstat update callback for a dataset's kstats (dataset_kstats_create()
  * installs it as ks_update).
  *
  * Refreshes every counter in the kstat's data area from the dataset's
  * wmsum counters and from its ZIL sums. The kstats cannot be written: a
  * KSTAT_WRITE request fails with EACCES. Returns 0 otherwise.
  */
 static int
 dataset_kstats_update(kstat_t *ksp, int rw)
 {
 	dataset_kstats_t *stats = ksp->ks_private;
 	dataset_kstat_values_t *vals = ksp->ks_data;
 
 	ASSERT3P(stats->dk_kstats->ks_data, ==, vals);
 
 	if (rw == KSTAT_WRITE) {
 		return (EACCES);
 	}
 
 	/* Write counters. */
 	vals->dkv_writes.value.ui64 =
 	    wmsum_value(&stats->dk_sums.dss_writes);
 	vals->dkv_nwritten.value.ui64 =
 	    wmsum_value(&stats->dk_sums.dss_nwritten);
 
 	/* Read counters. */
 	vals->dkv_reads.value.ui64 =
 	    wmsum_value(&stats->dk_sums.dss_reads);
 	vals->dkv_nread.value.ui64 =
 	    wmsum_value(&stats->dk_sums.dss_nread);
 
 	/* Unlink counters. */
 	vals->dkv_nunlinks.value.ui64 =
 	    wmsum_value(&stats->dk_sums.dss_nunlinks);
 	vals->dkv_nunlinked.value.ui64 =
 	    wmsum_value(&stats->dk_sums.dss_nunlinked);
 
 	zil_kstat_values_update(&vals->dkv_zil_stats, &stats->dk_zil_sums);
 
 	return (0);
 }
 
 /*
  * Creates and installs the kstat of the dataset behind `objset' and
  * initializes the counters kept in `dk'.
  *
  * Snapshots get no kstat: for them the function returns 0 without
  * touching `dk'. For every other dataset the kstat is named
  * "objset-0x<objset id>" and placed under the module "zfs/<pool name>".
  *
  * Returns 0 on success, EINVAL if snprintf() reports an error,
  * ENAMETOOLONG if either name does not fit into KSTAT_STRLEN, or ENOMEM
  * if kstat_create() fails.
  */
 int
 dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset)
 {
 	/*
 	 * There should not be anything wrong with having kstats for
 	 * snapshots. Since we are not sure how useful they would be
 	 * though nor how much their memory overhead would matter in
 	 * a filesystem with many snapshots, we skip them for now.
 	 */
 	if (dmu_objset_is_snapshot(objset))
 		return (0);
 
 	/*
 	 * At the time of this writing, KSTAT_STRLEN is 255 in Linux,
 	 * and the spa_name can theoretically be up to 256 characters.
 	 * In reality though the spa_name can be 240 characters max
 	 * [see origin directory name check in pool_namecheck()]. Thus,
 	 * the naming scheme for the module name below should not cause
 	 * any truncations. In the event that a truncation does happen
 	 * though, due to some future change, we skip creating the kstat,
 	 * log the event and return an error.
 	 *
 	 * The objset id is unsigned, so it is printed with %llu.
 	 */
 	char kstat_module_name[KSTAT_STRLEN];
 	int n = snprintf(kstat_module_name, sizeof (kstat_module_name),
 	    "zfs/%s", spa_name(dmu_objset_spa(objset)));
 	if (n < 0) {
 		zfs_dbgmsg("failed to create dataset kstat for objset %llu: "
 		    " snprintf() for kstat module name returned %d",
 		    (unsigned long long)dmu_objset_id(objset), n);
 		return (SET_ERROR(EINVAL));
 	} else if (n >= KSTAT_STRLEN) {
 		zfs_dbgmsg("failed to create dataset kstat for objset %llu: "
 		    "kstat module name length (%d) exceeds limit (%d)",
 		    (unsigned long long)dmu_objset_id(objset),
 		    n, KSTAT_STRLEN);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	char kstat_name[KSTAT_STRLEN];
 	n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx",
 	    (unsigned long long)dmu_objset_id(objset));
 	if (n < 0) {
 		zfs_dbgmsg("failed to create dataset kstat for objset %llu: "
 		    " snprintf() for kstat name returned %d",
 		    (unsigned long long)dmu_objset_id(objset), n);
 		return (SET_ERROR(EINVAL));
 	} else if (n >= KSTAT_STRLEN) {
 		zfs_dbgmsg("failed to create dataset kstat for objset %llu: "
 		    "kstat name length (%d) exceeds limit (%d)",
 		    (unsigned long long)dmu_objset_id(objset),
 		    n, KSTAT_STRLEN);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	/* One named entry per kstat_named_t contained in the template. */
 	kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name,
 	    "dataset", KSTAT_TYPE_NAMED,
 	    sizeof (empty_dataset_kstats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (kstat == NULL)
 		return (SET_ERROR(ENOMEM));
 
 	/* Private copy of the template; freed in dataset_kstats_destroy(). */
 	dataset_kstat_values_t *dk_kstats =
 	    kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP);
 	memcpy(dk_kstats, &empty_dataset_kstats,
 	    sizeof (empty_dataset_kstats));
 
 	/* The dataset name lives in its own buffer, referenced by the kstat. */
 	char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	dsl_dataset_name(objset->os_dsl_dataset, ds_name);
 	KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name;
 	KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) =
 	    ZFS_MAX_DATASET_NAME_LEN;
 
 	kstat->ks_data = dk_kstats;
 	kstat->ks_update = dataset_kstats_update;
 	kstat->ks_private = dk;
 	kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN;
 
 	wmsum_init(&dk->dk_sums.dss_writes, 0);
 	wmsum_init(&dk->dk_sums.dss_nwritten, 0);
 	wmsum_init(&dk->dk_sums.dss_reads, 0);
 	wmsum_init(&dk->dk_sums.dss_nread, 0);
 	wmsum_init(&dk->dk_sums.dss_nunlinks, 0);
 	wmsum_init(&dk->dk_sums.dss_nunlinked, 0);
 	zil_sums_init(&dk->dk_zil_sums);
 
 	dk->dk_kstats = kstat;
 	kstat_install(kstat);
 	return (0);
 }
 
 /*
  * Tears down what dataset_kstats_create() set up: deletes the kstat,
  * frees the dataset name buffer and the kstat value buffer, and finalizes
  * all counters of `dk'. Does nothing if `dk' has no kstat.
  */
 void
 dataset_kstats_destroy(dataset_kstats_t *dk)
 {
 	dataset_kstat_values_t *vals;
 
 	if (dk->dk_kstats == NULL)
 		return;
 
 	/* Fetch the data pointer before the kstat itself is deleted. */
 	vals = dk->dk_kstats->ks_data;
 	kstat_delete(dk->dk_kstats);
 	dk->dk_kstats = NULL;
 
 	/* The name buffer first, then the structure that points to it. */
 	kmem_free(KSTAT_NAMED_STR_PTR(&vals->dkv_ds_name),
 	    KSTAT_NAMED_STR_BUFLEN(&vals->dkv_ds_name));
 	kmem_free(vals, sizeof (empty_dataset_kstats));
 
 	wmsum_fini(&dk->dk_sums.dss_writes);
 	wmsum_fini(&dk->dk_sums.dss_nwritten);
 	wmsum_fini(&dk->dk_sums.dss_reads);
 	wmsum_fini(&dk->dk_sums.dss_nread);
 	wmsum_fini(&dk->dk_sums.dss_nunlinks);
 	wmsum_fini(&dk->dk_sums.dss_nunlinked);
 	zil_sums_fini(&dk->dk_zil_sums);
 }
 
 /*
  * Replaces the dataset name published through the kstat with `name'.
  * The copy is bounded by the size of the existing name buffer, so an
  * overlong name is truncated by strlcpy(). Does nothing if `dk' has no
  * kstat.
  */
 void
 dataset_kstats_rename(dataset_kstats_t *dk, const char *name)
 {
+	if (dk->dk_kstats == NULL)
+		return;
+
 	dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data;
 	char *ds_name;
 
 	ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name);
 	/* pointer comparison, hence ASSERT3P rather than ASSERT3S */
 	ASSERT3P(ds_name, !=, NULL);
 	(void) strlcpy(ds_name, name,
 	    KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name));
 }
 
 /*
  * Accounts for one write of `nwritten' bytes: the write counter is
  * incremented by one and the written-bytes counter by `nwritten', which
  * is asserted to be non-negative. Nothing is counted if `dk' has no kstat.
  */
 void
 dataset_kstats_update_write_kstats(dataset_kstats_t *dk,
     int64_t nwritten)
 {
 	ASSERT3S(nwritten, >=, 0);
 
 	if (dk->dk_kstats != NULL) {
 		wmsum_add(&dk->dk_sums.dss_writes, 1);
 		wmsum_add(&dk->dk_sums.dss_nwritten, nwritten);
 	}
 }
 
 /*
  * Accounts for one read of `nread' bytes: the read counter is incremented
  * by one and the read-bytes counter by `nread', which is asserted to be
  * non-negative. Nothing is counted if `dk' has no kstat.
  */
 void
 dataset_kstats_update_read_kstats(dataset_kstats_t *dk,
     int64_t nread)
 {
 	ASSERT3S(nread, >=, 0);
 
 	if (dk->dk_kstats != NULL) {
 		wmsum_add(&dk->dk_sums.dss_reads, 1);
 		wmsum_add(&dk->dk_sums.dss_nread, nread);
 	}
 }
 
 /*
  * Adds `delta' to the dataset's "nunlinks" counter. Nothing is counted if
  * `dk' has no kstat.
  */
 void
 dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta)
 {
 	if (dk->dk_kstats != NULL) {
 		wmsum_add(&dk->dk_sums.dss_nunlinks, delta);
 	}
 }
 
 /*
  * Adds `delta' to the dataset's "nunlinked" counter. Nothing is counted
  * if `dk' has no kstat.
  */
 void
 dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta)
 {
 	if (dk->dk_kstats != NULL) {
 		wmsum_add(&dk->dk_sums.dss_nunlinked, delta);
 	}
 }
diff --git a/sys/contrib/openzfs/module/zfs/zil.c b/sys/contrib/openzfs/module/zfs/zil.c
index 9b5d866a8c22..9293429e43c7 100644
--- a/sys/contrib/openzfs/module/zfs/zil.c
+++ b/sys/contrib/openzfs/module/zfs/zil.c
@@ -1,4383 +1,4400 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright (c) 2018 Datto Inc.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/arc.h>
 #include <sys/stat.h>
 #include <sys/zil.h>
 #include <sys/zil_impl.h>
 #include <sys/dsl_dataset.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dsl_pool.h>
 #include <sys/metaslab.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
 #include <sys/brt.h>
 #include <sys/wmsum.h>
 
 /*
  * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system
  * calls that change the file system. Each itx has enough information to
  * be able to replay them after a system crash, power loss, or
  * equivalent failure mode. These are stored in memory until either:
  *
  *   1. they are committed to the pool by the DMU transaction group
  *      (txg), at which point they can be discarded; or
  *   2. they are committed to the on-disk ZIL for the dataset being
  *      modified (e.g. due to an fsync, O_DSYNC, or other synchronous
  *      requirement).
  *
  * In the event of a crash or power loss, the itxs contained by each
  * dataset's on-disk ZIL will be replayed when that dataset is first
  * instantiated (e.g. if the dataset is a normal filesystem, when it is
  * first mounted).
  *
  * As hinted at above, there is one ZIL per dataset (both the in-memory
  * representation, and the on-disk representation). The on-disk format
  * consists of 3 parts:
  *
  * 	- a single, per-dataset, ZIL header; which points to a chain of
  * 	- zero or more ZIL blocks; each of which contains
  * 	- zero or more ZIL records
  *
  * A ZIL record holds the information necessary to replay a single
  * system call transaction. A ZIL block can hold many ZIL records, and
  * the blocks are chained together, similarly to a singly linked list.
  *
  * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL
  * block in the chain, and the ZIL header points to the first block in
  * the chain.
  *
  * Note, there is not a fixed place in the pool to hold these ZIL
  * blocks; they are dynamically allocated and freed as needed from the
  * blocks available on the pool, though they can be preferentially
  * allocated from a dedicated "log" vdev.
  */
 
 /*
  * This controls the amount of time that a ZIL block (lwb) will remain
  * "open" when it isn't "full", and it has a thread waiting for it to be
  * committed to stable storage. Please refer to the zil_commit_waiter()
  * function (and the comments within it) for more details.
  */
 static uint_t zfs_commit_timeout_pct = 10;
 
 /*
  * See zil.h for more information about these fields.
  *
  * This table holds the name and data type of every global ZIL kstat
  * entry; zil_kstats_global_update() asserts that it is the data area of
  * the kstat being updated.
  */
 static zil_kstat_values_t zil_stats = {
 	{ "zil_commit_count",			KSTAT_DATA_UINT64 },
 	{ "zil_commit_writer_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_count",			KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_indirect_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_copied_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_count",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_needcopy_bytes",		KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_normal_alloc",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_count",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_bytes",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_write",	KSTAT_DATA_UINT64 },
 	{ "zil_itx_metaslab_slog_alloc",	KSTAT_DATA_UINT64 },
 };
 
 /*
  * Global ZIL sums, copied into zil_stats by zil_kstats_global_update(),
  * and a kstat pointer.
  * NOTE(review): zil_kstats_global is presumably the handle of the kstat
  * that exposes zil_stats -- confirm where it is created.
  */
 static zil_sums_t zil_sums_global;
 static kstat_t *zil_kstats_global;
 
 /*
  * Disable intent logging replay.  This global ZIL switch affects all pools.
  */
 int zil_replay_disable = 0;
 
 /*
  * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
  * the disk(s) by the ZIL after an LWB write has completed. Setting this
  * will cause ZIL corruption on power loss if a volatile out-of-order
  * write cache is enabled.
  */
 static int zil_nocacheflush = 0;
 
 /*
  * Limit SLOG write size per commit executed with synchronous priority.
  * Any writes above that will be executed with lower (asynchronous) priority
  * to limit potential SLOG device abuse by single active ZIL writer.
  */
 static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
 
 /*
  * kmem caches used by the ZIL.
  * NOTE(review): judging by the names these hold lwb and commit waiter
  * (zcw) objects -- confirm at the point where the caches are created.
  */
 static kmem_cache_t *zil_lwb_cache;
 static kmem_cache_t *zil_zcw_cache;
 
 /* Forward declarations of static functions defined later in this file. */
 static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx);
 static itx_t *zil_itx_clone(itx_t *oitx);
 static uint64_t zil_max_waste_space(zilog_t *zilog);
 
 /*
  * AVL comparator for zil_bp_node_t entries. Nodes are ordered by the vdev
  * of their DVA first and, for equal vdevs, by the DVA offset.
  */
 static int
 zil_bp_compare(const void *x1, const void *x2)
 {
 	const dva_t *lhs = &((zil_bp_node_t *)x1)->zn_dva;
 	const dva_t *rhs = &((zil_bp_node_t *)x2)->zn_dva;
 	int order;
 
 	order = TREE_CMP(DVA_GET_VDEV(lhs), DVA_GET_VDEV(rhs));
 	if (likely(order))
 		return (order);
 
 	/* Same vdev: the offset decides. */
 	order = TREE_CMP(DVA_GET_OFFSET(lhs), DVA_GET_OFFSET(rhs));
 	return (order);
 }
 
 /*
  * Creates the zilog's AVL tree of zil_bp_node_t entries (zl_bp_tree),
  * ordered by zil_bp_compare().
  */
 static void
 zil_bp_tree_init(zilog_t *zilog)
 {
 	avl_create(&zilog->zl_bp_tree, zil_bp_compare,
 	    sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node));
 }
 
 /*
  * Frees every node of the zilog's bp tree and then destroys the tree
  * itself.
  */
 static void
 zil_bp_tree_fini(zilog_t *zilog)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 	void *cookie = NULL;
 	zil_bp_node_t *node;
 
 	/* avl_destroy_nodes() hands out one node per call until it is done. */
 	for (node = avl_destroy_nodes(tree, &cookie); node != NULL;
 	    node = avl_destroy_nodes(tree, &cookie)) {
 		kmem_free(node, sizeof (zil_bp_node_t));
 	}
 
 	avl_destroy(tree);
 }
 
 /*
  * Records the identity DVA of `bp' in the zilog's bp tree.
  *
  * Returns 0 if the DVA was added, or if `bp' is an embedded block pointer
  * (those are not tracked). Returns EEXIST if the tree already holds an
  * entry for the same DVA.
  */
 int
 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
 {
 	avl_tree_t *tree = &zilog->zl_bp_tree;
 	avl_index_t insert_at;
 	zil_bp_node_t *node;
 	const dva_t *id;
 
 	if (BP_IS_EMBEDDED(bp))
 		return (0);
 
 	id = BP_IDENTITY(bp);
 
 	if (avl_find(tree, id, &insert_at) == NULL) {
 		/* Not seen before: insert at the position found above. */
 		node = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP);
 		node->zn_dva = *id;
 		avl_insert(tree, node, insert_at);
 		return (0);
 	}
 
 	return (SET_ERROR(EEXIST));
 }
 
 /*
  * Returns the zilog's header (zl_header) as a zil_header_t pointer.
  * NOTE(review): the explicit cast suggests that zl_header is declared
  * const and that this accessor exists to obtain a writable pointer --
  * confirm against the zilog_t definition.
  */
 static zil_header_t *
 zil_header_in_syncing_context(zilog_t *zilog)
 {
 	return ((zil_header_t *)zilog->zl_header);
 }
 
 /*
  * Seeds the checksum field of `bp' for a new log chain: the two GUID
  * words are filled with pseudo-random bytes, the objset word is set to
  * the id of the zilog's objset and the sequence word is set to 1.
  */
 static void
 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp)
 {
 	zio_cksum_t *cksum = &bp->blk_cksum;
 
 	(void) random_get_pseudo_bytes(
 	    (void *)&cksum->zc_word[ZIL_ZC_GUID_0],
 	    sizeof (cksum->zc_word[ZIL_ZC_GUID_0]));
 	(void) random_get_pseudo_bytes(
 	    (void *)&cksum->zc_word[ZIL_ZC_GUID_1],
 	    sizeof (cksum->zc_word[ZIL_ZC_GUID_1]));
 
 	cksum->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os);
 	cksum->zc_word[ZIL_ZC_SEQ] = 1ULL;
 }
 
 static int
 zil_kstats_global_update(kstat_t *ksp, int rw)
 {
 	zil_kstat_values_t *zs = ksp->ks_data;
 	ASSERT3P(&zil_stats, ==, zs);
 
 	if (rw == KSTAT_WRITE) {
 		return (SET_ERROR(EACCES));
 	}
 
 	zil_kstat_values_update(zs, &zil_sums_global);
 
 	return (0);
 }
 
 /*
  * Read a log block and make sure it's valid.
  */
 static int
 zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp,
     blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		zio_flags |= ZIO_FLAG_SPECULATIVE;
 
 	if (!decrypt)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func,
 	    abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		zio_cksum_t cksum = bp->blk_cksum;
 
 		/*
 		 * Validate the checksummed log block.
 		 *
 		 * Sequence numbers should be... sequential.  The checksum
 		 * verifier for the next block should be bp's checksum plus 1.
 		 *
 		 * Also check the log chain linkage and size used.
 		 */
 		cksum.zc_word[ZIL_ZC_SEQ]++;
 
 		uint64_t size = BP_GET_LSIZE(bp);
 		if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
 			zil_chain_t *zilc = (*abuf)->b_data;
 			char *lr = (char *)(zilc + 1);
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) ||
 			    zilc->zc_nused < sizeof (*zilc) ||
 			    zilc->zc_nused > size) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				*begin = lr;
 				*end = lr + zilc->zc_nused - sizeof (*zilc);
 				*nbp = zilc->zc_next_blk;
 			}
 		} else {
 			char *lr = (*abuf)->b_data;
 			zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1;
 
 			if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum,
 			    sizeof (cksum)) ||
 			    (zilc->zc_nused > (size - sizeof (*zilc)))) {
 				error = SET_ERROR(ECKSUM);
 			} else {
 				*begin = lr;
 				*end = lr + zilc->zc_nused;
 				*nbp = zilc->zc_next_blk;
 			}
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Read a TX_WRITE log data block.
  */
 static int
 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf)
 {
 	zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 	const blkptr_t *bp = &lr->lr_blkptr;
 	arc_flags_t aflags = ARC_FLAG_WAIT;
 	arc_buf_t *abuf = NULL;
 	zbookmark_phys_t zb;
 	int error;
 
 	if (BP_IS_HOLE(bp)) {
 		if (wbuf != NULL)
 			memset(wbuf, 0, MAX(BP_GET_LSIZE(bp), lr->lr_length));
 		return (0);
 	}
 
 	if (zilog->zl_header->zh_claim_txg == 0)
 		zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB;
 
 	/*
 	 * If we are not using the resulting data, we are just checking that
 	 * it hasn't been corrupted so we don't need to waste CPU time
 	 * decompressing and decrypting it.
 	 */
 	if (wbuf == NULL)
 		zio_flags |= ZIO_FLAG_RAW;
 
 	ASSERT3U(BP_GET_LSIZE(bp), !=, 0);
 	SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid,
 	    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
 
 	error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf,
 	    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 
 	if (error == 0) {
 		if (wbuf != NULL)
 			memcpy(wbuf, abuf->b_data, arc_buf_size(abuf));
 		arc_buf_destroy(abuf, &abuf);
 	}
 
 	return (error);
 }
 
 void
 zil_sums_init(zil_sums_t *zs)
 {
 	wmsum_init(&zs->zil_commit_count, 0);
 	wmsum_init(&zs->zil_commit_writer_count, 0);
 	wmsum_init(&zs->zil_itx_count, 0);
 	wmsum_init(&zs->zil_itx_indirect_count, 0);
 	wmsum_init(&zs->zil_itx_indirect_bytes, 0);
 	wmsum_init(&zs->zil_itx_copied_count, 0);
 	wmsum_init(&zs->zil_itx_copied_bytes, 0);
 	wmsum_init(&zs->zil_itx_needcopy_count, 0);
 	wmsum_init(&zs->zil_itx_needcopy_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_write, 0);
 	wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_count, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_write, 0);
 	wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0);
 }
 
 void
 zil_sums_fini(zil_sums_t *zs)
 {
 	wmsum_fini(&zs->zil_commit_count);
 	wmsum_fini(&zs->zil_commit_writer_count);
 	wmsum_fini(&zs->zil_itx_count);
 	wmsum_fini(&zs->zil_itx_indirect_count);
 	wmsum_fini(&zs->zil_itx_indirect_bytes);
 	wmsum_fini(&zs->zil_itx_copied_count);
 	wmsum_fini(&zs->zil_itx_copied_bytes);
 	wmsum_fini(&zs->zil_itx_needcopy_count);
 	wmsum_fini(&zs->zil_itx_needcopy_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_count);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_write);
 	wmsum_fini(&zs->zil_itx_metaslab_normal_alloc);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_count);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_bytes);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_write);
 	wmsum_fini(&zs->zil_itx_metaslab_slog_alloc);
 }
 
 void
 zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums)
 {
 	zs->zil_commit_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_commit_count);
 	zs->zil_commit_writer_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_commit_writer_count);
 	zs->zil_itx_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_count);
 	zs->zil_itx_indirect_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_indirect_count);
 	zs->zil_itx_indirect_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_indirect_bytes);
 	zs->zil_itx_copied_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_copied_count);
 	zs->zil_itx_copied_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_copied_bytes);
 	zs->zil_itx_needcopy_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_needcopy_count);
 	zs->zil_itx_needcopy_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_needcopy_bytes);
 	zs->zil_itx_metaslab_normal_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_count);
 	zs->zil_itx_metaslab_normal_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes);
 	zs->zil_itx_metaslab_normal_write.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_write);
 	zs->zil_itx_metaslab_normal_alloc.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc);
 	zs->zil_itx_metaslab_slog_count.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_count);
 	zs->zil_itx_metaslab_slog_bytes.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes);
 	zs->zil_itx_metaslab_slog_write.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_write);
 	zs->zil_itx_metaslab_slog_alloc.value.ui64 =
 	    wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc);
 }
 
 /*
  * Parse the intent log, and call parse_func for each valid record within.
  */
 int
 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func,
     zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg,
     boolean_t decrypt)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	boolean_t claimed = !!zh->zh_claim_txg;
 	uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX;
 	uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX;
 	uint64_t max_blk_seq = 0;
 	uint64_t max_lr_seq = 0;
 	uint64_t blk_count = 0;
 	uint64_t lr_count = 0;
 	blkptr_t blk, next_blk = {{{{0}}}};
 	int error = 0;
 
 	/*
 	 * Old logs didn't record the maximum zh_claim_lr_seq.
 	 */
 	if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID))
 		claim_lr_seq = UINT64_MAX;
 
 	/*
 	 * Starting at the block pointed to by zh_log we read the log chain.
 	 * For each block in the chain we strongly check that block to
 	 * ensure its validity.  We stop when an invalid block is found.
 	 * For each block pointer in the chain we call parse_blk_func().
 	 * For each record in each valid block we call parse_lr_func().
 	 * If the log has been claimed, stop if we encounter a sequence
 	 * number greater than the highest claimed sequence number.
 	 */
 	zil_bp_tree_init(zilog);
 
 	for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
 		uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ];
 		int reclen;
 		char *lrp, *end;
 		arc_buf_t *abuf = NULL;
 
 		if (blk_seq > claim_blk_seq)
 			break;
 
 		error = parse_blk_func(zilog, &blk, arg, txg);
 		if (error != 0)
 			break;
 		ASSERT3U(max_blk_seq, <, blk_seq);
 		max_blk_seq = blk_seq;
 		blk_count++;
 
 		if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq)
 			break;
 
 		error = zil_read_log_block(zilog, decrypt, &blk, &next_blk,
 		    &lrp, &end, &abuf);
 		if (error != 0) {
 			if (abuf)
 				arc_buf_destroy(abuf, &abuf);
 			if (claimed) {
 				char name[ZFS_MAX_DATASET_NAME_LEN];
 
 				dmu_objset_name(zilog->zl_os, name);
 
 				cmn_err(CE_WARN, "ZFS read log block error %d, "
 				    "dataset %s, seq 0x%llx\n", error, name,
 				    (u_longlong_t)blk_seq);
 			}
 			break;
 		}
 
 		for (; lrp < end; lrp += reclen) {
 			lr_t *lr = (lr_t *)lrp;
+
+			/* Is there room left for a log record header? */
+			if ((char *)(lr + 1) > end) {
+				cmn_err(CE_WARN, "zil_parse: lr_t overrun");
+				error = SET_ERROR(ECKSUM);
+				arc_buf_destroy(abuf, &abuf);
+				goto done;
+			}
+
 			reclen = lr->lrc_reclen;
-			ASSERT3U(reclen, >=, sizeof (lr_t));
-			ASSERT3U(reclen, <=, end - lrp);
+			/* Validate the untruncated on-disk length field. */
+			if (lr->lrc_reclen < sizeof (lr_t) ||
+			    lr->lrc_reclen > (uint64_t)(end - lrp)) {
+				cmn_err(CE_WARN,
+				    "zil_parse: lr_t has an invalid reclen");
+				error = SET_ERROR(ECKSUM);
+				arc_buf_destroy(abuf, &abuf);
+				goto done;
+			}
+
 			if (lr->lrc_seq > claim_lr_seq) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 
 			error = parse_lr_func(zilog, lr, arg, txg);
 			if (error != 0) {
 				arc_buf_destroy(abuf, &abuf);
 				goto done;
 			}
 			ASSERT3U(max_lr_seq, <, lr->lrc_seq);
 			max_lr_seq = lr->lrc_seq;
 			lr_count++;
 		}
 		arc_buf_destroy(abuf, &abuf);
 	}
 done:
 	zilog->zl_parse_error = error;
 	zilog->zl_parse_blk_seq = max_blk_seq;
 	zilog->zl_parse_lr_seq = max_lr_seq;
 	zilog->zl_parse_blk_count = blk_count;
 	zilog->zl_parse_lr_count = lr_count;
 
 	zil_bp_tree_fini(zilog);
 
 	return (error);
 }
 
 static int
 zil_clear_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t first_txg)
 {
 	(void) tx;
 	ASSERT(!BP_IS_HOLE(bp));
 
 	/*
 	 * As we call this function from the context of a rewind to a
 	 * checkpoint, each ZIL block whose txg is later than the txg
 	 * that we rewind to is invalid. Thus, we return -1 so
 	 * zil_parse() doesn't attempt to read it.
 	 */
 	if (bp->blk_birth >= first_txg)
 		return (-1);
 
 	if (zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	zio_free(zilog->zl_spa, first_txg, bp);
 	return (0);
 }
 
 static int
 zil_noop_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 	(void) zilog, (void) lrc, (void) tx, (void) first_txg;
 	return (0);
 }
 
 static int
 zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t first_txg)
 {
 	/*
 	 * Claim log block if not already committed and not already claimed.
 	 * If tx == NULL, just verify that the block is claimable.
 	 */
 	if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
 	    zil_bp_tree_add(zilog, bp) != 0)
 		return (0);
 
 	return (zio_wait(zio_claim(NULL, zilog->zl_spa,
 	    tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB)));
 }
 
 static int
 zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	int error;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 
 	/*
 	 * If the block is not readable, don't claim it.  This can happen
 	 * in normal operation when a log block is written to disk before
 	 * some of the dmu_sync() blocks it points to.  In this case, the
 	 * transaction cannot have been committed to anyone (we would have
 	 * waited for all writes to be stable first), so it is semantically
 	 * correct to declare this the end of the log.
 	 */
 	if (lr->lr_blkptr.blk_birth >= first_txg) {
 		error = zil_read_log_data(zilog, lr, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
 static int
 zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	const blkptr_t *bp;
 	spa_t *spa = zilog->zl_spa;
 	uint_t ii;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 	ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
 	    lr_bps[lr->lr_nbps]));
 
 	if (tx == NULL) {
 		return (0);
 	}
 
 	/*
 	 * XXX: Do we need to byteswap lr?
 	 */
 
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		/*
 		 * When data is embedded into the BP there is no need to create
 		 * BRT entry as there is no data block.  Just copy the BP as it
 		 * contains the data.
 		 */
 		if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 			continue;
 
 		/*
 		 * We can not handle block pointers from the future, since they
 		 * are not yet allocated.  It should not normally happen, but
 		 * just in case lets be safe and just stop here now instead of
 		 * corrupting the pool.
 		 */
 		if (BP_PHYSICAL_BIRTH(bp) >= first_txg)
 			return (SET_ERROR(ENOENT));
 
 		/*
 		 * Assert the block is really allocated before we reference it.
 		 */
 		metaslab_check_free(spa, bp);
 	}
 
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 		if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp))
 			brt_pending_add(spa, bp, tx);
 	}
 
 	return (0);
 }
 
 static int
 zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t first_txg)
 {
 
 	switch (lrc->lrc_txtype) {
 	case TX_WRITE:
 		return (zil_claim_write(zilog, lrc, tx, first_txg));
 	case TX_CLONE_RANGE:
 		return (zil_claim_clone_range(zilog, lrc, tx, first_txg));
 	default:
 		return (0);
 	}
 }
 
 static int
 zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t claim_txg)
 {
 	(void) claim_txg;
 
 	zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 
 	return (0);
 }
 
 static int
 zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
 {
 	lr_write_t *lr = (lr_write_t *)lrc;
 	blkptr_t *bp = &lr->lr_blkptr;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 
 	/*
 	 * If we previously claimed it, we need to free it.
 	 */
 	if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
 	    !BP_IS_HOLE(bp)) {
 		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
 	}
 
 	return (0);
 }
 
 static int
 zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
 {
 	const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
 	const blkptr_t *bp;
 	spa_t *spa;
 	uint_t ii;
 
 	ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr));
 	ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t,
 	    lr_bps[lr->lr_nbps]));
 
 	if (tx == NULL) {
 		return (0);
 	}
 
 	spa = zilog->zl_spa;
 
 	for (ii = 0; ii < lr->lr_nbps; ii++) {
 		bp = &lr->lr_bps[ii];
 
 		if (!BP_IS_HOLE(bp)) {
 			zio_free(spa, dmu_tx_get_txg(tx), bp);
 		}
 	}
 
 	return (0);
 }
 
 static int
 zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
     uint64_t claim_txg)
 {
 
 	if (claim_txg == 0) {
 		return (0);
 	}
 
 	switch (lrc->lrc_txtype) {
 	case TX_WRITE:
 		return (zil_free_write(zilog, lrc, tx, claim_txg));
 	case TX_CLONE_RANGE:
 		return (zil_free_clone_range(zilog, lrc, tx));
 	default:
 		return (0);
 	}
 }
 
 static int
 zil_lwb_vdev_compare(const void *x1, const void *x2)
 {
 	const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
 	const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;
 
 	return (TREE_CMP(v1, v2));
 }
 
 /*
  * Allocate a new lwb.  We may already have a block pointer for it, in which
  * case we get size and version from there.  Or we may not yet, in which case
  * we choose them here and later make the block allocation match.
  */
 static lwb_t *
 zil_alloc_lwb(zilog_t *zilog, int sz, blkptr_t *bp, boolean_t slog,
     uint64_t txg, lwb_state_t state)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	if (bp) {
 		lwb->lwb_blk = *bp;
 		lwb->lwb_slim = (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2);
 		sz = BP_GET_LSIZE(bp);
 	} else {
 		BP_ZERO(&lwb->lwb_blk);
 		lwb->lwb_slim = (spa_version(zilog->zl_spa) >=
 		    SPA_VERSION_SLIM_ZIL);
 	}
 	lwb->lwb_slog = slog;
 	lwb->lwb_error = 0;
 	if (lwb->lwb_slim) {
 		lwb->lwb_nmax = sz;
 		lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t);
 	} else {
 		lwb->lwb_nmax = sz - sizeof (zil_chain_t);
 		lwb->lwb_nused = lwb->lwb_nfilled = 0;
 	}
 	lwb->lwb_sz = sz;
 	lwb->lwb_state = state;
 	lwb->lwb_buf = zio_buf_alloc(sz);
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 	lwb->lwb_root_zio = NULL;
 	lwb->lwb_issued_timestamp = 0;
 	lwb->lwb_issued_txg = 0;
 	lwb->lwb_alloc_txg = txg;
 	lwb->lwb_max_txg = 0;
 
 	mutex_enter(&zilog->zl_lock);
 	list_insert_tail(&zilog->zl_lwb_list, lwb);
 	if (state != LWB_STATE_NEW)
 		zilog->zl_last_lwb_opened = lwb;
 	mutex_exit(&zilog->zl_lock);
 
 	return (lwb);
 }
 
 static void
 zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 	    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 	ASSERT3P(lwb->lwb_child_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
 	ASSERT3P(lwb->lwb_root_zio, ==, NULL);
 	ASSERT3U(lwb->lwb_alloc_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa));
 	VERIFY(list_is_empty(&lwb->lwb_itxs));
 	VERIFY(list_is_empty(&lwb->lwb_waiters));
 	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
 
 	/*
 	 * Clear the zilog's field to indicate this lwb is no longer
 	 * valid, and prevent use-after-free errors.
 	 */
 	if (zilog->zl_last_lwb_opened == lwb)
 		zilog->zl_last_lwb_opened = NULL;
 
 	kmem_cache_free(zil_lwb_cache, lwb);
 }
 
 /*
  * Called when we create in-memory log transactions so that we know
  * to cleanup the itxs at the end of spa_sync().
  */
 static void
 zilog_dirty(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	if (ds->ds_is_snapshot)
 		panic("dirtying snapshot!");
 
 	if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
 		/* up the hold count until we can be written out */
 		dmu_buf_add_ref(ds->ds_dbuf, zilog);
 
 		zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg);
 	}
 }
 
 /*
  * Determine if the zil is dirty in the specified txg. Callers wanting to
  * ensure that the dirty state does not change must hold the itxg_lock for
  * the specified txg. Holding the lock will ensure that the zil cannot be
  * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
  * state.
  */
 static boolean_t __maybe_unused
 zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
 		return (B_TRUE);
 	return (B_FALSE);
 }
 
 /*
  * Determine if the zil is dirty. The zil is considered dirty if it has
  * any pending itx records that have not been cleaned by zil_clean().
  */
 static boolean_t
 zilog_is_dirty(zilog_t *zilog)
 {
 	dsl_pool_t *dp = zilog->zl_dmu_pool;
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t))
 			return (B_TRUE);
 	}
 	return (B_FALSE);
 }
 
 /*
  * Its called in zil_commit context (zil_process_commit_list()/zil_create()).
  * It activates SPA_FEATURE_ZILSAXATTR feature, if its enabled.
  * Check dsl_dataset_feature_is_active to avoid txg_wait_synced() on every
  * zil_commit.
  */
 static void
 zil_commit_activate_saxattr_feature(zilog_t *zilog)
 {
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 
 	if (spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
 	    dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL &&
 	    !dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(ds, tx);
 		txg = dmu_tx_get_txg(tx);
 
 		mutex_enter(&ds->ds_lock);
 		ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
 		    (void *)B_TRUE;
 		mutex_exit(&ds->ds_lock);
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	}
 }
 
 /*
  * Create an on-disk intent log.
  */
 static lwb_t *
 zil_create(zilog_t *zilog)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb = NULL;
 	uint64_t txg = 0;
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
 	boolean_t slog = FALSE;
 	dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	ASSERT(zh->zh_claim_txg == 0);
 	ASSERT(zh->zh_replay_seq == 0);
 
 	blk = zh->zh_log;
 
 	/*
 	 * Allocate an initial log block if:
 	 *    - there isn't one already
 	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
 		VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		if (!BP_IS_HOLE(&blk)) {
 			zio_free(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk,
 		    ZIL_MIN_BLKSZ, &slog);
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
 	}
 
 	/*
 	 * Allocate a log write block (lwb) for the first log block.
 	 */
 	if (error == 0)
 		lwb = zil_alloc_lwb(zilog, 0, &blk, slog, txg, LWB_STATE_NEW);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
 	 * and wait for zil_sync() to stuff the block pointer into zh_log.
 	 * (zh is part of the MOS, so we cannot modify it in open context.)
 	 */
 	if (tx != NULL) {
 		/*
 		 * If "zilsaxattr" feature is enabled on zpool, then activate
 		 * it now when we're creating the ZIL chain. We can't wait with
 		 * this until we write the first xattr log record because we
 		 * need to wait for the feature activation to sync out.
 		 */
 		if (spa_feature_is_enabled(zilog->zl_spa,
 		    SPA_FEATURE_ZILSAXATTR) && dmu_objset_type(zilog->zl_os) !=
 		    DMU_OST_ZVOL) {
 			mutex_enter(&ds->ds_lock);
 			ds->ds_feature_activation[SPA_FEATURE_ZILSAXATTR] =
 			    (void *)B_TRUE;
 			mutex_exit(&ds->ds_lock);
 		}
 
 		dmu_tx_commit(tx);
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 	} else {
 		/*
 		 * This branch covers the case where we enable the feature on a
 		 * zpool that has existing ZIL headers.
 		 */
 		zil_commit_activate_saxattr_feature(zilog);
 	}
 	IMPLY(spa_feature_is_enabled(zilog->zl_spa, SPA_FEATURE_ZILSAXATTR) &&
 	    dmu_objset_type(zilog->zl_os) != DMU_OST_ZVOL,
 	    dsl_dataset_feature_is_active(ds, SPA_FEATURE_ZILSAXATTR));
 
 	ASSERT(error != 0 || memcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
 	IMPLY(error == 0, lwb != NULL);
 
 	return (lwb);
 }
 
 /*
  * In one tx, free all log blocks and clear the log header. If keep_first
  * is set, then we're replaying a log with no content. We want to keep the
  * first block, however, so that the first synchronous transaction doesn't
  * require a txg_wait_synced() in zil_create(). We don't need to
  * txg_wait_synced() here either when keep_first is set, because both
  * zil_create() and zil_destroy() will wait for any in-progress destroys
  * to complete.
  * Return B_TRUE if there were any entries to replay.
  */
 boolean_t
 zil_destroy(zilog_t *zilog, boolean_t keep_first)
 {
 	const zil_header_t *zh = zilog->zl_header;
 	lwb_t *lwb;
 	dmu_tx_t *tx;
 	uint64_t txg;
 
 	/*
 	 * Wait for any previous destroy to complete.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
 	zilog->zl_old_header = *zh;		/* debugging aid */
 
 	if (BP_IS_HOLE(&zh->zh_log))
 		return (B_FALSE);
 
 	tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
 
 	if (!list_is_empty(&zilog->zl_lwb_list)) {
 		ASSERT(zh->zh_claim_txg == 0);
 		VERIFY(!keep_first);
 		while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) {
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 			if (!BP_IS_HOLE(&lwb->lwb_blk))
 				zio_free(zilog->zl_spa, txg, &lwb->lwb_blk);
 			zil_free_lwb(zilog, lwb);
 		}
 	} else if (!keep_first) {
 		zil_destroy_sync(zilog, tx);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 
 	return (B_TRUE);
 }
 
 void
 zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	(void) zil_parse(zilog, zil_free_log_block,
 	    zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE);
 }
 
 int
 zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
 {
 	dmu_tx_t *tx = txarg;
 	zilog_t *zilog;
 	uint64_t first_txg;
 	zil_header_t *zh;
 	objset_t *os;
 	int error;
 
 	error = dmu_objset_own_obj(dp, ds->ds_object,
 	    DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os);
 	if (error != 0) {
 		/*
 		 * EBUSY indicates that the objset is inconsistent, in which
 		 * case it can not have a ZIL.
 		 */
 		if (error != EBUSY) {
 			cmn_err(CE_WARN, "can't open objset for %llu, error %u",
 			    (unsigned long long)ds->ds_object, error);
 		}
 
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	zh = zil_header_in_syncing_context(zilog);
 	ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa));
 	first_txg = spa_min_claim_txg(zilog->zl_spa);
 
 	/*
 	 * If the spa_log_state is not set to be cleared, check whether
 	 * the current uberblock is a checkpoint one and if the current
 	 * header has been claimed before moving on.
 	 *
 	 * If the current uberblock is a checkpointed uberblock then
 	 * one of the following scenarios took place:
 	 *
 	 * 1] We are currently rewinding to the checkpoint of the pool.
 	 * 2] We crashed in the middle of a checkpoint rewind but we
 	 *    did manage to write the checkpointed uberblock to the
 	 *    vdev labels, so when we tried to import the pool again
 	 *    the checkpointed uberblock was selected from the import
 	 *    procedure.
 	 *
 	 * In both cases we want to zero out all the ZIL blocks, except
 	 * the ones that have been claimed at the time of the checkpoint
 	 * (their zh_claim_txg != 0). The reason is that these blocks
 	 * may be corrupted since we may have reused their locations on
 	 * disk after we took the checkpoint.
 	 *
 	 * We could try to set spa_log_state to SPA_LOG_CLEAR earlier
 	 * when we first figure out whether the current uberblock is
 	 * checkpointed or not. Unfortunately, that would discard all
 	 * the logs, including the ones that are claimed, and we would
 	 * leak space.
 	 */
 	if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR ||
 	    (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 	    zh->zh_claim_txg == 0)) {
 		if (!BP_IS_HOLE(&zh->zh_log)) {
 			(void) zil_parse(zilog, zil_clear_log_block,
 			    zil_noop_log_record, tx, first_txg, B_FALSE);
 		}
 		BP_ZERO(&zh->zh_log);
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 		dmu_objset_disown(os, B_FALSE, FTAG);
 		return (0);
 	}
 
 	/*
 	 * If we are not rewinding and opening the pool normally, then
 	 * the min_claim_txg should be equal to the first txg of the pool.
 	 */
 	ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa));
 
 	/*
 	 * Claim all log blocks if we haven't already done so, and remember
 	 * the highest claimed sequence number.  This ensures that if we can
 	 * read only part of the log now (e.g. due to a missing device),
 	 * but we can read the entire log later, we will not try to replay
 	 * or destroy beyond the last block we successfully claimed.
 	 */
 	ASSERT3U(zh->zh_claim_txg, <=, first_txg);
 	if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) {
 		(void) zil_parse(zilog, zil_claim_log_block,
 		    zil_claim_log_record, tx, first_txg, B_FALSE);
 		zh->zh_claim_txg = first_txg;
 		zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq;
 		zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq;
 		if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1)
 			zh->zh_flags |= ZIL_REPLAY_NEEDED;
 		zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID;
 		if (os->os_encrypted)
 			os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;
 		dsl_dataset_dirty(dmu_objset_ds(os), tx);
 	}
 
 	ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1));
 	dmu_objset_disown(os, B_FALSE, FTAG);
 	return (0);
 }
 
 /*
  * Check the log by walking the log chain.
  * Checksum errors are ok as they indicate the end of the chain.
  * Any other error (no device or read failure) returns an error.
  */
 int
 zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
 {
 	(void) dp;
 	zilog_t *zilog;
 	objset_t *os;
 	blkptr_t *bp;
 	int error;
 
 	ASSERT(tx == NULL);
 
 	error = dmu_objset_from_ds(ds, &os);
 	if (error != 0) {
 		cmn_err(CE_WARN, "can't open objset %llu, error %d",
 		    (unsigned long long)ds->ds_object, error);
 		return (0);
 	}
 
 	zilog = dmu_objset_zil(os);
 	bp = (blkptr_t *)&zilog->zl_header->zh_log;
 
 	if (!BP_IS_HOLE(bp)) {
 		vdev_t *vd;
 		boolean_t valid = B_TRUE;
 
 		/*
 		 * Check the first block and determine if it's on a log device
 		 * which may have been removed or faulted prior to loading this
 		 * pool.  If so, there's no point in checking the rest of the
 		 * log as its content should have already been synced to the
 		 * pool.
 		 */
 		spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER);
 		vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0]));
 		if (vd->vdev_islog && vdev_is_dead(vd))
 			valid = vdev_log_state_valid(vd);
 		spa_config_exit(os->os_spa, SCL_STATE, FTAG);
 
 		if (!valid)
 			return (0);
 
 		/*
 		 * Check whether the current uberblock is checkpointed (e.g.
 		 * we are rewinding) and whether the current header has been
 		 * claimed or not. If it hasn't then skip verifying it. We
 		 * do this because its ZIL blocks may be part of the pool's
 		 * state before the rewind, which is no longer valid.
 		 */
 		zil_header_t *zh = zil_header_in_syncing_context(zilog);
 		if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 &&
 		    zh->zh_claim_txg == 0)
 			return (0);
 	}
 
 	/*
 	 * Because tx == NULL, zil_claim_log_block() will not actually claim
 	 * any blocks, but just determine whether it is possible to do so.
 	 * In addition to checking the log chain, zil_claim_log_block()
 	 * will invoke zio_claim() with a done func of spa_claim_notify(),
 	 * which will update spa_max_claim_txg.  See spa_load() for details.
 	 */
 	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
 	    zilog->zl_header->zh_claim_txg ? -1ULL :
 	    spa_min_claim_txg(os->os_spa), B_FALSE);
 
 	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
 }
 
 /*
  * When an itx is "skipped", this function is used to properly mark the
  * waiter as "done", and signal any thread(s) waiting on it. An itx can
  * be skipped (and not committed to an lwb) for a variety of reasons,
  * one of them being that the itx was committed via spa_sync(), prior to
  * it being committed to an lwb; this can happen if a thread calling
  * zil_commit() is racing with spa_sync().
  *
  * The waiter must not already be marked done (asserted below).  The
  * zcw_done flag is set and zcw_cv is broadcast while holding zcw_lock.
  */
 static void
 zil_commit_waiter_skip(zil_commit_waiter_t *zcw)
 {
 	mutex_enter(&zcw->zcw_lock);
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 	zcw->zcw_done = B_TRUE;
 	cv_broadcast(&zcw->zcw_cv);
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * This function is used when the given waiter is to be linked into an
  * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb.
  * At this point, the waiter will no longer be referenced by the itx,
  * and instead, will be referenced by the lwb.
  *
  * The caller must hold the zilog's zl_issuer_lock, and additionally
  * zl_lock when the lwb is not in the OPENED state (both asserted below).
  * The lwb must be neither NEW nor FLUSH_DONE.
  */
 static void
 zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb)
 {
 	/*
 	 * The lwb_waiters field of the lwb is protected by the zilog's
 	 * zl_issuer_lock while the lwb is open and zl_lock otherwise.
 	 * zl_issuer_lock also protects leaving the open state.
 	 * zcw_lwb setting is protected by zl_issuer_lock and state !=
 	 * flush_done, which transition is protected by zl_lock.
 	 */
 	ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_issuer_lock));
 	IMPLY(lwb->lwb_state != LWB_STATE_OPENED,
 	    MUTEX_HELD(&lwb->lwb_zilog->zl_lock));
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	/* The waiter must not already be on a list or owned by an lwb. */
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	list_insert_tail(&lwb->lwb_waiters, zcw);
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	zcw->zcw_lwb = lwb;
 }
 
 /*
  * Queue a commit waiter on the caller-supplied "nolwb waiters" list.
  * This is the path taken when zio_alloc_zil() fails to allocate a ZIL
  * block, so there is no lwb for the waiter to be attached to; the list
  * lives inside of zil_process_commit_list().
  */
 static void
 zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb)
 {
 	/* The waiter must be neither on a list nor owned by an lwb. */
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 
 	list_insert_tail(nolwb, zcw);
 }
 
 /*
  * Record every top-level vdev referenced by the DVAs of 'bp' in the lwb's
  * vdev tree, so that each one appears in the tree exactly once.  Nothing
  * is recorded when zil_nocacheflush is set.  The lwb must not yet have
  * reached the WRITE_DONE or FLUSH_DONE state.
  */
 void
 zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp)
 {
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	if (zil_nocacheflush)
 		return;
 
 	avl_tree_t *tree = &lwb->lwb_vdev_tree;
 	int dva_count = BP_GET_NDVAS(bp);
 
 	mutex_enter(&lwb->lwb_vdev_lock);
 	for (int d = 0; d < dva_count; d++) {
 		zil_vdev_node_t key;
 		avl_index_t slot;
 
 		key.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[d]);
 
 		/* Already tracked; nothing to add for this DVA. */
 		if (avl_find(tree, &key, &slot) != NULL)
 			continue;
 
 		zil_vdev_node_t *node = kmem_alloc(sizeof (*node), KM_SLEEP);
 		node->zv_vdev = key.zv_vdev;
 		avl_insert(tree, node, slot);
 	}
 	mutex_exit(&lwb->lwb_vdev_lock);
 }
 
 /*
  * Merge the vdev tree of 'lwb' into the vdev tree of the younger 'nlwb'.
  * On return the tree of 'lwb' is empty: each of its nodes has either been
  * moved into 'nlwb' or, if 'nlwb' already tracked that vdev, been freed.
  */
 static void
 zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb)
 {
 	avl_tree_t *from = &lwb->lwb_vdev_tree;
 	avl_tree_t *to = &nlwb->lwb_vdev_tree;
 	zil_vdev_node_t *node;
 	void *walk = NULL;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
 	ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
 
 	/*
 	 * 'lwb' is at a point in its lifetime where lwb_vdev_tree does not
 	 * need the protection of lwb_vdev_lock (it will only be modified
 	 * while holding zilog->zl_lock), as its writes and those of its
 	 * children have all completed.  The younger 'nlwb', however, may
 	 * be waiting on future writes to additional vdevs, so its lock is
 	 * taken for the duration of the merge.
 	 */
 	mutex_enter(&nlwb->lwb_vdev_lock);
 
 	/*
 	 * Tear down the source tree node by node.  A node whose vdev is
 	 * already present in the destination is a duplicate and is freed;
 	 * any other node is re-linked into the destination.
 	 */
 	while ((node = avl_destroy_nodes(from, &walk)) != NULL) {
 		avl_index_t slot;
 
 		if (avl_find(to, node, &slot) != NULL) {
 			kmem_free(node, sizeof (*node));
 			continue;
 		}
 		avl_insert(to, node, slot);
 	}
 
 	mutex_exit(&nlwb->lwb_vdev_lock);
 }
 
 /*
  * Raise the lwb's lwb_max_txg to 'txg' if 'txg' is larger than the value
  * recorded so far.
  */
 void
 zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 {
 	if (txg > lwb->lwb_max_txg)
 		lwb->lwb_max_txg = txg;
 }
 
 /*
  * This function is called after all vdevs associated with a given lwb
  * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
  * as the lwb write completes, if "zil_nocacheflush" is set. Further,
  * all "previous" lwb's will have completed before this function is
  * called; i.e. this function is called for all previous lwbs before
  * it's called for "this" lwb (enforced via the zio dependencies
  * configured in zil_lwb_set_zio_dependency()).
  *
  * The intention is for this function to be called as soon as the
  * contents of an lwb are considered "stable" on disk, and will survive
  * any sudden loss of power. At this point, any threads waiting for the
  * lwb to reach this state are signalled, and the "waiter" structures
  * are marked "done".
  *
  * It is installed as the done callback of the lwb's root zio by
  * zil_lwb_write_issue(); zio->io_private is the lwb.
  */
 static void
 zil_lwb_flush_vdevs_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	zilog_t *zilog = lwb->lwb_zilog;
 	zil_commit_waiter_t *zcw;
 	itx_t *itx;
 
 	/* Drop the SCL_STATE hold taken (with the lwb as tag) at issue time. */
 	spa_config_exit(zilog->zl_spa, SCL_STATE, lwb);
 
 	/* Time elapsed since the lwb was issued. */
 	hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp;
 
 	mutex_enter(&zilog->zl_lock);
 
 	/* Weighted average: 7/8 of the previous value plus 1/8 of this one. */
 	zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8;
 
 	lwb->lwb_root_zio = NULL;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	lwb->lwb_state = LWB_STATE_FLUSH_DONE;
 
 	if (zilog->zl_last_lwb_opened == lwb) {
 		/*
 		 * Remember the highest committed log sequence number
 		 * for ztest. We only update this value when all the log
 		 * writes succeeded, because ztest wants to ASSERT that
 		 * it got the whole log chain.
 		 */
 		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
 	}
 
 	/* The itxs attached to this lwb are no longer needed. */
 	while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL)
 		zil_itx_destroy(itx);
 
 	/* Detach every waiter from the lwb, record the result and wake it. */
 	while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) {
 		mutex_enter(&zcw->zcw_lock);
 
 		ASSERT3P(zcw->zcw_lwb, ==, lwb);
 		zcw->zcw_lwb = NULL;
 		/*
 		 * We expect any ZIO errors from child ZIOs to have been
 		 * propagated "up" to this specific LWB's root ZIO, in
 		 * order for this error handling to work correctly. This
 		 * includes ZIO errors from either this LWB's write or
 		 * flush, as well as any errors from other dependent LWBs
 		 * (e.g. a root LWB ZIO that might be a child of this LWB).
 		 *
 		 * With that said, it's important to note that LWB flush
 		 * errors are not propagated up to the LWB root ZIO.
 		 * This is incorrect behavior, and results in VDEV flush
 		 * errors not being handled correctly here. See the
 		 * comment above the call to "zio_flush" for details.
 		 */
 
 		zcw->zcw_zio_error = zio->io_error;
 
 		ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 		zcw->zcw_done = B_TRUE;
 		cv_broadcast(&zcw->zcw_cv);
 
 		mutex_exit(&zcw->zcw_lock);
 	}
 
 	/* Copy the txg out now; the lwb must not be touched after unlock. */
 	uint64_t txg = lwb->lwb_issued_txg;
 
 	/* Once we drop the lock, lwb may be freed by zil_sync(). */
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * Account for the completion of this lwb in its txg's inflight
 	 * counter, and wake any thread waiting in zil_lwb_flush_wait_all()
 	 * once the counter reaches zero.
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
 	zilog->zl_lwb_inflight[txg & TXG_MASK]--;
 	if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
 		cv_broadcast(&zilog->zl_lwb_io_cv);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 }
 
 /*
  * Wait for the completion of all issued writes/flushes of the provided txg.
  * It guarantees that zil_lwb_flush_vdevs_done() has been called and has
  * returned for each of them.
  *
  * 'txg' must be the currently syncing txg of the pool (asserted below).
  */
 static void
 zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
 {
 	ASSERT3U(txg, ==, spa_syncing_txg(zilog->zl_spa));
 
 	/* Sleep until the inflight counter for this txg drains to zero. */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	while (zilog->zl_lwb_inflight[txg & TXG_MASK] > 0)
 		cv_wait(&zilog->zl_lwb_io_cv, &zilog->zl_lwb_io_lock);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 
 #ifdef ZFS_DEBUG
 	/*
 	 * Debug-only consistency check: walk the whole lwb list and verify
 	 * that no lwb issued in this txg or earlier is still in flight, and
 	 * that every lwb whose write has completed has released its buffer.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	lwb_t *lwb = list_head(&zilog->zl_lwb_list);
 	while (lwb != NULL) {
 		if (lwb->lwb_issued_txg <= txg) {
 			ASSERT(lwb->lwb_state != LWB_STATE_ISSUED);
 			ASSERT(lwb->lwb_state != LWB_STATE_WRITE_DONE);
 			IMPLY(lwb->lwb_issued_txg > 0,
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 		}
 		IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 		    lwb->lwb_state == LWB_STATE_FLUSH_DONE,
 		    lwb->lwb_buf == NULL);
 		lwb = list_next(&zilog->zl_lwb_list, lwb);
 	}
 	mutex_exit(&zilog->zl_lwb_io_lock);
 	mutex_exit(&zilog->zl_lock);
 #endif
 }
 
 /*
  * This is called when an lwb's write zio completes. The callback's
  * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
  * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
  * in writing out this specific lwb's data, and in the case that cache
  * flushes have been deferred, vdevs involved in writing the data for
  * previous lwbs. The writes corresponding to all the vdevs in the
  * lwb_vdev_tree will have completed by the time this is called, due to
  * the zio dependencies configured in zil_lwb_set_zio_dependency(),
  * which takes deferred flushes into account. The lwb will be "done"
  * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
  * completion callback for the lwb's root zio.
  *
  * zio->io_private is the lwb.  On return the lwb's buffer has been freed,
  * its state is WRITE_DONE and its vdev tree is empty.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
 {
 	lwb_t *lwb = zio->io_private;
 	spa_t *spa = zio->io_spa;
 	zilog_t *zilog = lwb->lwb_zilog;
 	avl_tree_t *t = &lwb->lwb_vdev_tree;
 	void *cookie = NULL;
 	zil_vdev_node_t *zv;
 	lwb_t *nlwb;
 
 	ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0);
 
 	/* The write is over; release the abd and the lwb's data buffer. */
 	abd_free(zio->io_abd);
 	zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 	lwb->lwb_buf = NULL;
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED);
 	lwb->lwb_state = LWB_STATE_WRITE_DONE;
 	lwb->lwb_child_zio = NULL;
 	lwb->lwb_write_zio = NULL;
 
 	/*
 	 * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not
 	 * called for it yet, and when it will be, it won't be able to make
 	 * its write ZIO a parent of this ZIO.  In such case we can not defer
 	 * our flushes or below may be a race between the done callbacks.
 	 */
 	nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED)
 		nlwb = NULL;
 	mutex_exit(&zilog->zl_lock);
 
 	/* No vdevs recorded, so there is nothing to flush or defer. */
 	if (avl_numnodes(t) == 0)
 		return;
 
 	/*
 	 * If there was an IO error, we're not going to call zio_flush()
 	 * on these vdevs, so we simply empty the tree and free the
 	 * nodes. We avoid calling zio_flush() since there isn't any
 	 * good reason for doing so, after the lwb block failed to be
 	 * written out.
 	 *
 	 * Additionally, we don't perform any further error handling at
 	 * this point (e.g. setting "zcw_zio_error" appropriately), as
 	 * we expect that to occur in "zil_lwb_flush_vdevs_done" (thus,
 	 * we expect any error seen here, to have been propagated to
 	 * that function).
 	 */
 	if (zio->io_error != 0) {
 		while ((zv = avl_destroy_nodes(t, &cookie)) != NULL)
 			kmem_free(zv, sizeof (*zv));
 		return;
 	}
 
 	/*
 	 * If this lwb does not have any threads waiting for it to
 	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
 	 * command to the vdevs written to by "this" lwb, and instead
 	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
 	 * command for those vdevs. Thus, we merge the vdev tree of
 	 * "this" lwb with the vdev tree of the "next" lwb in the list,
 	 * and assume the "next" lwb will handle flushing the vdevs (or
 	 * deferring the flush(s) again).
 	 *
 	 * This is a useful performance optimization, especially for
 	 * workloads with lots of async write activity and few sync
 	 * write and/or fsync activity, as it has the potential to
 	 * coalesce multiple flush commands to a vdev into one.
 	 */
 	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
 		ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
 		return;
 	}
 
 	/*
 	 * Flush each recorded vdev as a child of the lwb's root zio, freeing
 	 * the tree nodes as we go.  Entries whose top-level vdev can no
 	 * longer be looked up are skipped.
 	 */
 	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
 		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
 		if (vd != NULL) {
 			/*
 			 * The "ZIO_FLAG_DONT_PROPAGATE" is currently
 			 * always used within "zio_flush". This means,
 			 * any errors when flushing the vdev(s), will
 			 * (unfortunately) not be handled correctly,
 			 * since these "zio_flush" errors will not be
 			 * propagated up to "zil_lwb_flush_vdevs_done".
 			 */
 			zio_flush(lwb->lwb_root_zio, vd);
 		}
 		kmem_free(zv, sizeof (*zv));
 	}
 }
 
 /*
  * Build the zio dependency chain, which is used to preserve the ordering of
  * lwb completions that is required by the semantics of the ZIL. Each new lwb
  * zio becomes a parent of the previous lwb zio, such that the new lwb's zio
  * cannot complete until the previous lwb's zio completes.
  *
  * This is required by the semantics of zil_commit(): the commit waiters
  * attached to the lwbs will be woken in the lwb zio's completion callback,
  * so this zio dependency graph ensures the waiters are woken in the correct
  * order (the same order the lwbs were created).
  *
  * The caller must hold zl_lock (asserted below).
  */
 static void
 zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_lock));
 
 	/*
 	 * Nothing to order against if this is the first lwb on the list or
 	 * the previous lwb has already fully completed.
 	 */
 	lwb_t *prev_lwb = list_prev(&zilog->zl_lwb_list, lwb);
 	if (prev_lwb == NULL ||
 	    prev_lwb->lwb_state == LWB_STATE_FLUSH_DONE)
 		return;
 
 	/*
 	 * If the previous lwb's write hasn't already completed, we also want
 	 * to order the completion of the lwb write zios (above, we only order
 	 * the completion of the lwb root zios). This is required because of
 	 * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb.
 	 *
 	 * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous
 	 * lwb will rely on this lwb to flush the vdevs written to by that
 	 * previous lwb. Thus, we need to ensure this lwb doesn't issue the
 	 * flush until after the previous lwb's write completes. We ensure
 	 * this ordering by setting the zio parent/child relationship here.
 	 *
 	 * Without this relationship on the lwb's write zio, it's possible
 	 * for this lwb's write to complete prior to the previous lwb's write
 	 * completing; and thus, the vdevs for the previous lwb would be
 	 * flushed prior to that lwb's data being written to those vdevs (the
 	 * vdevs are flushed in the lwb write zio's completion handler,
 	 * zil_lwb_write_done()).
 	 */
 	if (prev_lwb->lwb_state == LWB_STATE_ISSUED) {
 		ASSERT3P(prev_lwb->lwb_write_zio, !=, NULL);
 		zio_add_child(lwb->lwb_write_zio, prev_lwb->lwb_write_zio);
 	} else {
 		ASSERT3S(prev_lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
 	}
 
 	/* In either case the root zios are chained to order completions. */
 	ASSERT3P(prev_lwb->lwb_root_zio, !=, NULL);
 	zio_add_child(lwb->lwb_root_zio, prev_lwb->lwb_root_zio);
 }
 
 
 /*
  * "Open" an lwb so that it is ready to accept new itxs being committed to
  * it.  The call is idempotent: an lwb that has already been opened is left
  * untouched.  The caller must hold zl_issuer_lock.
  */
 static void
 zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb)
 {
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	if (lwb->lwb_state == LWB_STATE_NEW) {
 		/*
 		 * First use of this lwb: flip it to OPENED and remember it
 		 * as the most recently opened lwb, both under zl_lock.
 		 */
 		mutex_enter(&zilog->zl_lock);
 		lwb->lwb_state = LWB_STATE_OPENED;
 		zilog->zl_last_lwb_opened = lwb;
 		mutex_exit(&zilog->zl_lock);
 		return;
 	}
 
 	/* Anything other than NEW must already be OPENED. */
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 }
 
 /*
  * Maximum block size used by the ZIL.  This is picked up when the ZIL is
  * initialized.  Otherwise this should not be used directly; see
  * zl_max_block_size instead.  Defaults to SPA_OLD_MAXBLOCKSIZE.
  */
 static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE;
 
 /*
  * Plan how a burst of 'size' bytes should be split between log blocks.
  *
  * Returns the planned block payload size and stores in '*minsize' the
  * predicted minimal size of the first block, or 0 when no useful
  * prediction can be made.
  */
 static uint_t
 zil_lwb_plan(zilog_t *zilog, uint64_t size, uint_t *minsize)
 {
 	/* Largest payload a single block can carry. */
 	uint_t md = zilog->zl_max_block_size - sizeof (zil_chain_t);
 
 	/* Small bursts are written as-is in one block. */
 	if (size <= md) {
 		*minsize = size;
 		return (size);
 	}
 
 	/*
 	 * Big bursts use maximum blocks.  The first block size
 	 * is hard to predict, but it does not really matter.
 	 */
 	if (size > 8 * md) {
 		*minsize = 0;
 		return (md);
 	}
 
 	/*
 	 * Medium bursts try to divide evenly to better utilize several SLOG
 	 * VDEVs.  The first block size we predict assuming the worst case of
 	 * maxing out others.  Fall back to using maximum blocks if due to
 	 * large records or wasted space we can not predict anything better.
 	 */
 	uint_t burst = size;
 	uint_t nblocks = DIV_ROUND_UP(burst, md - sizeof (lr_write_t));
 	uint_t even = DIV_ROUND_UP(burst, nblocks);
 
 	/* Per-block slack: the larger of the waste limit and zl_cur_max. */
 	uint_t slack = zil_max_waste_space(zilog);
 	if (zilog->zl_cur_max > slack)
 		slack = zilog->zl_cur_max;
 
 	if (even > md - slack) {
 		*minsize = 0;
 		return (md);
 	}
 
 	/* What is left for the first block if all the others max out. */
 	uint_t first = burst - (md - slack) * (nblocks - 1);
 	*minsize = (first > slack) ? first : slack;
 	return (even);
 }
 
 /*
  * Try to predict the next block size based on previous history.  Make the
  * prediction sufficient for 7 of 8 previous bursts.  Don't try to save if
  * the saving is less than 50%: extra writes may cost more, but we don't
  * want a single spike to badly affect our predictions.
  */
 static uint_t
 zil_lwb_predict(zilog_t *zilog)
 {
 	uint_t cur_min = 0;
 	uint_t opt = UINT_MAX;
 
 	/* If we are in the middle of a burst, take it into account also. */
 	if (zilog->zl_cur_size > 0)
 		opt = zil_lwb_plan(zilog, zilog->zl_cur_size, &cur_min);
 
 	/* Find minimum optimal size.  We don't need to go below that. */
 	for (int i = 0; i < ZIL_BURSTS; i++) {
 		if (zilog->zl_prev_opt[i] < opt)
 			opt = zilog->zl_prev_opt[i];
 	}
 
 	/* Find two biggest minimal first block sizes above the optimal. */
 	uint_t largest = (cur_min > opt) ? cur_min : opt;
 	uint_t second = opt;
 	for (int i = 0; i < ZIL_BURSTS; i++) {
 		uint_t cand = zilog->zl_prev_min[i];
 
 		if (cand >= largest) {
 			second = largest;
 			largest = cand;
 		} else if (cand > second) {
 			second = cand;
 		}
 	}
 
 	/*
 	 * If second minimum size gives 50% saving -- use it.  It may cost us
 	 * one additional write later, but the space saving is just too big.
 	 */
 	if (largest < second * 2)
 		return (largest);
 	return (second);
 }
 
 /*
  * Close the log block for being issued and allocate the next one.
  * Has to be called under zl_issuer_lock to chain more lwbs.
  *
  * 'lwb' must be in the OPENED state and is moved to CLOSED.  'state' is
  * passed through to zil_alloc_lwb() for the newly allocated lwb.
  *
  * Returns the next lwb, or NULL if 'lwb' already carries an allocation
  * error.
  */
 static lwb_t *
 zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb, lwb_state_t state)
 {
 	uint64_t blksz, plan, plan2;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
 	lwb->lwb_state = LWB_STATE_CLOSED;
 
 	/*
 	 * If there was an allocation failure then returned NULL will trigger
 	 * zil_commit_writer_stall() at the caller.  This is inherently racy,
 	 * since allocation may not have happened yet.
 	 */
 	if (lwb->lwb_error != 0)
 		return (NULL);
 
 	/*
 	 * Log blocks are pre-allocated.  Here we select the size of the next
 	 * block, based on what's left of this burst and the previous history.
 	 * While we try to only write used part of the block, we can't just
 	 * always allocate the maximum block size because we can exhaust all
 	 * available pool log space, so we try to be reasonable.
 	 */
 	if (zilog->zl_cur_left > 0) {
 		/*
 		 * We are in the middle of a burst and know how much is left.
 		 * But if workload is multi-threaded there may be more soon.
 		 * Try to predict what can it be and plan for the worst case.
 		 */
 		uint_t m;
 		plan = zil_lwb_plan(zilog, zilog->zl_cur_left, &m);
 		if (zilog->zl_parallel) {
 			/* Use the larger of the two plans. */
 			plan2 = zil_lwb_plan(zilog, zilog->zl_cur_left +
 			    zil_lwb_predict(zilog), &m);
 			if (plan < plan2)
 				plan = plan2;
 		}
 	} else {
 		/*
 		 * The previous burst is done and we can only predict what
 		 * will come next.
 		 */
 		plan = zil_lwb_predict(zilog);
 	}
 	/*
 	 * Add room for the chain header, round up to a multiple of
 	 * ZIL_MIN_BLKSZ and cap at the zilog's maximum block size.
 	 */
 	blksz = plan + sizeof (zil_chain_t);
 	blksz = P2ROUNDUP_TYPED(blksz, ZIL_MIN_BLKSZ, uint64_t);
 	blksz = MIN(blksz, zilog->zl_max_block_size);
 	DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, uint64_t, blksz,
 	    uint64_t, plan);
 
 	return (zil_alloc_lwb(zilog, blksz, NULL, 0, 0, state));
 }
 
 /*
  * Finalize previously closed block and issue the write zio.
  *
  * The lwb must be in the CLOSED state.  Its itxs are copied into the
  * buffer and the lwb becomes READY.  If its block pointer has not been
  * allocated yet (and no allocation error is recorded) the function
  * returns at that point, relying on the thread issuing the previous lwb
  * to issue this one.  Otherwise the lwb is issued, a block pointer for
  * the next lwb is allocated, and any following lwbs that were already
  * READY are issued in turn.
  */
 static void
 zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 {
 	spa_t *spa = zilog->zl_spa;
 	zil_chain_t *zilc;
 	/*
 	 * Initialized because zio_alloc_zil() is not called at all when the
 	 * lwb already carries an error, yet the value is still copied into
 	 * the next lwb below.
 	 */
 	boolean_t slog = B_FALSE;
 	zbookmark_phys_t zb;
 	zio_priority_t prio;
 	int error;
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	/* Actually fill the lwb with the data. */
 	for (itx_t *itx = list_head(&lwb->lwb_itxs); itx;
 	    itx = list_next(&lwb->lwb_itxs, itx))
 		zil_lwb_commit(zilog, lwb, itx);
 	lwb->lwb_nused = lwb->lwb_nfilled;
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
 
 	lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb,
 	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * The lwb is now ready to be issued, but it can be only if it already
 	 * got its block pointer allocated or the allocation has failed.
 	 * Otherwise leave it as-is, relying on some other thread to issue it
 	 * after allocating its block pointer via calling zil_lwb_write_issue()
 	 * for the previous lwb(s) in the chain.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb->lwb_state = LWB_STATE_READY;
 	if (BP_IS_HOLE(&lwb->lwb_blk) && lwb->lwb_error == 0) {
 		mutex_exit(&zilog->zl_lock);
 		return;
 	}
 	mutex_exit(&zilog->zl_lock);
 
 next_lwb:
 	/* The chain header sits at the start of slim blocks, else at the end. */
 	if (lwb->lwb_slim)
 		zilc = (zil_chain_t *)lwb->lwb_buf;
 	else
 		zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_nmax);
 	int wsz = lwb->lwb_sz;
 	if (lwb->lwb_error == 0) {
 		abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, lwb->lwb_sz);
 		if (!lwb->lwb_slog || zilog->zl_cur_size <= zil_slog_bulk)
 			prio = ZIO_PRIORITY_SYNC_WRITE;
 		else
 			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 		    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
 		    lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]);
 		lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, spa, 0,
 		    &lwb->lwb_blk, lwb_abd, lwb->lwb_sz, zil_lwb_write_done,
 		    lwb, prio, ZIO_FLAG_CANFAIL, &zb);
 		zil_lwb_add_block(lwb, &lwb->lwb_blk);
 
 		if (lwb->lwb_slim) {
 			/* For Slim ZIL only write what is used. */
 			wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ,
 			    int);
 			ASSERT3S(wsz, <=, lwb->lwb_sz);
 			zio_shrink(lwb->lwb_write_zio, wsz);
 			wsz = lwb->lwb_write_zio->io_size;
 		}
 		/* Zero the unused tail of what will be written. */
 		memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused);
 		zilc->zc_pad = 0;
 		zilc->zc_nused = lwb->lwb_nused;
 		zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;
 	} else {
 		/*
 		 * We can't write the lwb if there was an allocation failure,
 		 * so create a null zio instead just to maintain dependencies.
 		 */
 		lwb->lwb_write_zio = zio_null(lwb->lwb_root_zio, spa, NULL,
 		    zil_lwb_write_done, lwb, ZIO_FLAG_CANFAIL);
 		lwb->lwb_write_zio->io_error = lwb->lwb_error;
 	}
 	if (lwb->lwb_child_zio)
 		zio_add_child(lwb->lwb_write_zio, lwb->lwb_child_zio);
 
 	/*
 	 * Open transaction to allocate the next block pointer.
 	 */
 	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
 	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * Allocate the next block pointer unless we are already in error.
 	 */
 	lwb_t *nlwb = list_next(&zilog->zl_lwb_list, lwb);
 	blkptr_t *bp = &zilc->zc_next_blk;
 	BP_ZERO(bp);
 	error = lwb->lwb_error;
 	if (error == 0) {
 		error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, nlwb->lwb_sz,
 		    &slog);
 	}
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		BP_SET_CHECKSUM(bp, nlwb->lwb_slim ? ZIO_CHECKSUM_ZILOG2 :
 		    ZIO_CHECKSUM_ZILOG);
 		/* The next block continues this block's checksum sequence. */
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
 		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;
 	}
 
 	/*
 	 * Reduce TXG open time by incrementing inflight counter and committing
 	 * the transaction.  zil_sync() will wait for it to return to zero.
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	lwb->lwb_issued_txg = txg;
 	zilog->zl_lwb_inflight[txg & TXG_MASK]++;
 	zilog->zl_lwb_max_issued_txg = MAX(txg, zilog->zl_lwb_max_issued_txg);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 	dmu_tx_commit(tx);
 
 	/* Released, with the same tag, in zil_lwb_flush_vdevs_done(). */
 	spa_config_enter(spa, SCL_STATE, lwb, RW_READER);
 
 	/*
 	 * We've completed all potentially blocking operations.  Update the
 	 * nlwb and allow it proceed without possible lock order reversals.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	zil_lwb_set_zio_dependency(zilog, lwb);
 	lwb->lwb_state = LWB_STATE_ISSUED;
 
 	if (nlwb) {
 		nlwb->lwb_blk = *bp;
 		nlwb->lwb_error = error;
 		nlwb->lwb_slog = slog;
 		nlwb->lwb_alloc_txg = txg;
 		/* Only a READY nlwb is ours to issue in the loop below. */
 		if (nlwb->lwb_state != LWB_STATE_READY)
 			nlwb = NULL;
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	if (lwb->lwb_slog) {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes,
 		    lwb->lwb_nused);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write,
 		    wsz);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	} else {
 		ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes,
 		    lwb->lwb_nused);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write,
 		    wsz);
 		ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc,
 		    BP_GET_LSIZE(&lwb->lwb_blk));
 	}
 	lwb->lwb_issued_timestamp = gethrtime();
 	if (lwb->lwb_child_zio)
 		zio_nowait(lwb->lwb_child_zio);
 	zio_nowait(lwb->lwb_write_zio);
 	zio_nowait(lwb->lwb_root_zio);
 
 	/*
 	 * If nlwb was ready when we gave it the block pointer,
 	 * it is on us to issue it and possibly following ones.
 	 */
 	lwb = nlwb;
 	if (lwb)
 		goto next_lwb;
 }
 
 /*
  * Maximum amount of data that can be put into a single log block: the
  * zilog's maximum block size less the chain header and a record header
  * of 'hdrsize' bytes.
  */
 uint64_t
 zil_max_log_data(zilog_t *zilog, size_t hdrsize)
 {
 	uint64_t payload = zilog->zl_max_block_size - sizeof (zil_chain_t);
 
 	return (payload - hdrsize);
 }
 
 /*
  * Maximum amount of log space we agree to waste to reduce the number of
  * WR_NEED_COPY chunks and with it the zl_get_data() overhead (~6%).  It
  * is one sixteenth of the data that fits next to an lr_write_t header.
  */
 static inline uint64_t
 zil_max_waste_space(zilog_t *zilog)
 {
 	uint64_t write_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 
 	return (write_data / 16);
 }
 
 /*
  * Maximum amount of write data for WR_COPIED.  For correctness, consumers
  * must fall back to WR_NEED_COPY if we can't fit the entire record into one
  * maximum sized log block, because each WR_COPIED record must fit in a
  * single log block.  Below that it is a tradeoff of additional memory copy
  * and possibly worse log space efficiency vs additional range lock/unlock.
  *
  * Read through zil_max_copied_data(), which also caps the value at what
  * fits into a single log block.
  */
 static uint_t zil_maxcopied = 7680;
 
 /*
  * Return the WR_COPIED data limit for this zilog: zil_maxcopied, capped at
  * the amount of write data that fits into a single log block.
  */
 uint64_t
 zil_max_copied_data(zilog_t *zilog)
 {
 	uint64_t block_limit = zil_max_log_data(zilog, sizeof (lr_write_t));
 
 	if (block_limit < zil_maxcopied)
 		return (block_limit);
 	return (zil_maxcopied);
 }
 
 /*
  * Size of the itx's log record as given by lrc_reclen.  A TX_COMMIT itx
  * counts as zero bytes.
  */
 static uint64_t
 zil_itx_record_size(itx_t *itx)
 {
 	lr_t *rec = &itx->itx_lr;
 
 	if (rec->lrc_txtype != TX_COMMIT) {
 		ASSERT3U(rec->lrc_reclen, >=, sizeof (lr_t));
 		return (rec->lrc_reclen);
 	}
 
 	return (0);
 }
 
 /*
  * Amount of data that accompanies the itx's record header.  This is
  * non-zero only for a WR_NEED_COPY TX_WRITE, where it is the write length
  * rounded up to a multiple of sizeof (uint64_t).
  */
 static uint64_t
 zil_itx_data_size(itx_t *itx)
 {
 	lr_t *rec = &itx->itx_lr;
 
 	if (rec->lrc_txtype != TX_WRITE || itx->itx_wr_state != WR_NEED_COPY)
 		return (0);
 
 	/* Such a record consists of the bare write header only. */
 	ASSERT3U(rec->lrc_reclen, ==, sizeof (lr_write_t));
 
 	lr_write_t *wrec = (lr_write_t *)rec;
 	return (P2ROUNDUP_TYPED(wrec->lr_length, sizeof (uint64_t),
 	    uint64_t));
 }
 
 /*
  * Full size of the itx: its record length plus the data size reported by
  * zil_itx_data_size().  A TX_COMMIT itx counts as zero bytes.
  */
 static uint64_t
 zil_itx_full_size(itx_t *itx)
 {
 	lr_t *rec = &itx->itx_lr;
 
 	if (rec->lrc_txtype != TX_COMMIT) {
 		ASSERT3U(rec->lrc_reclen, >=, sizeof (lr_t));
 		return (rec->lrc_reclen + zil_itx_data_size(itx));
 	}
 
 	return (0);
 }
 
 /*
  * Estimate space needed in the lwb for the itx.  Allocate more lwbs or
  * split the itx as needed, but don't touch the actual transaction data.
  * Has to be called under zl_issuer_lock to call zil_lwb_write_close()
  * to chain more lwbs.
  *
  * Every lwb that gets closed along the way is appended to 'ilwbs'.
  *
  * Returns the lwb the (last part of the) itx was assigned to, or NULL if
  * zil_lwb_write_close() could not provide a next lwb.
  */
 static lwb_t *
 zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs)
 {
 	itx_t *citx;
 	lr_t *lr, *clr;
 	lr_write_t *lrw;
 	uint64_t dlen, dnow, lwb_sp, reclen, max_log_data;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3P(lwb->lwb_buf, !=, NULL);
 
 	zil_lwb_write_open(zilog, lwb);
 
 	lr = &itx->itx_lr;
 	lrw = (lr_write_t *)lr;
 
 	/*
 	 * A commit itx doesn't represent any on-disk state; instead
 	 * it's simply used as a place holder on the commit list, and
 	 * provides a mechanism for attaching a "commit waiter" onto the
 	 * correct lwb (such that the waiter can be signalled upon
 	 * completion of that lwb). Thus, we don't process this itx's
 	 * log record if it's a commit itx (these itx's don't have log
 	 * records), and instead link the itx's waiter onto the lwb's
 	 * list of waiters.
 	 *
 	 * For more details, see the comment above zil_commit().
 	 */
 	if (lr->lrc_txtype == TX_COMMIT) {
 		zil_commit_waiter_link_lwb(itx->itx_private, lwb);
 		list_insert_tail(&lwb->lwb_itxs, itx);
 		return (lwb);
 	}
 
 	/* reclen is the record size; dlen the data still to be placed. */
 	reclen = lr->lrc_reclen;
 	ASSERT3U(reclen, >=, sizeof (lr_t));
 	ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0));
 	dlen = zil_itx_data_size(itx);
 
 cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
 	 * For WR_NEED_COPY optimize layout for minimal number of chunks.
 	 */
 	lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
 	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
 	    lwb_sp < zil_max_waste_space(zilog) &&
 	    (dlen % max_log_data == 0 ||
 	    lwb_sp < reclen + dlen % max_log_data))) {
 		list_insert_tail(ilwbs, lwb);
 		lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_OPENED);
 		if (lwb == NULL)
 			return (NULL);
 		lwb_sp = lwb->lwb_nmax - lwb->lwb_nused;
 	}
 
 	/*
 	 * There must be enough space in the log block to hold reclen.
 	 * For WR_COPIED, we need to fit the whole record in one block,
 	 * and reclen is the write record header size + the data size.
 	 * For WR_NEED_COPY, we can create multiple records, splitting
 	 * the data into multiple blocks, so we only need to fit one
 	 * word of data per block; in this case reclen is just the header
 	 * size (no data).
 	 */
 	ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp);
 
 	/* dnow is the part of the data that goes into this lwb. */
 	dnow = MIN(dlen, lwb_sp - reclen);
 	if (dlen > dnow) {
 		/*
 		 * Not everything fits: a clone of the itx covering the
 		 * first dnow bytes goes into this lwb, while the original
 		 * itx is advanced to describe the remainder.
 		 */
 		ASSERT3U(lr->lrc_txtype, ==, TX_WRITE);
 		ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY);
 		citx = zil_itx_clone(itx);
 		clr = &citx->itx_lr;
 		lr_write_t *clrw = (lr_write_t *)clr;
 		clrw->lr_length = dnow;
 		lrw->lr_offset += dnow;
 		lrw->lr_length -= dnow;
 		zilog->zl_cur_left -= dnow;
 	} else {
 		citx = itx;
 		clr = lr;
 	}
 
 	/*
 	 * We're actually making an entry, so update lrc_seq to be the
 	 * log record sequence number.  Note that this is generally not
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
 	clr->lrc_seq = ++zilog->zl_lr_seq;
 
 	/* Reserve the space; the bytes are copied by zil_lwb_commit(). */
 	lwb->lwb_nused += reclen + dnow;
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
 	zil_lwb_add_txg(lwb, lr->lrc_txg);
 	list_insert_tail(&lwb->lwb_itxs, citx);
 
 	/* Loop until all of the data has been assigned to some lwb. */
 	dlen -= dnow;
 	if (dlen > 0)
 		goto cont;
 
 	/* Writes from a txg beyond the freeze txg are synced out directly. */
 	if (lr->lrc_txtype == TX_WRITE &&
 	    lr->lrc_txg > spa_freeze_txg(zilog->zl_spa))
 		txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg);
 
 	return (lwb);
 }
 
 /*
  * Fill the actual transaction data into the lwb, following zil_lwb_assign().
  * Does not require locking.
  *
  * The record header of the itx is copied into the lwb buffer at offset
  * lwb_nfilled.  For TX_WRITE itxs that are not WR_COPIED, the payload
  * (WR_NEED_COPY) or the block pointer (WR_INDIRECT) is obtained through
  * the ->zl_get_data() callback.  TX_COMMIT itxs have nothing to copy and
  * return immediately.
  *
  * If ->zl_get_data() returns an error, the function returns without
  * advancing lwb_nfilled, so the space of that record is not counted as
  * filled.
  */
 static void
 zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx)
 {
 	lr_t *lr, *lrb;
 	lr_write_t *lrw, *lrwb;
 	char *lr_buf;
 	uint64_t dlen, reclen;
 
 	lr = &itx->itx_lr;
 	lrw = (lr_write_t *)lr;
 
 	if (lr->lrc_txtype == TX_COMMIT)
 		return;
 
 	reclen = lr->lrc_reclen;
 	dlen = zil_itx_data_size(itx);
 	/* The space must already be accounted for in lwb_nused. */
 	ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled);
 
 	lr_buf = lwb->lwb_buf + lwb->lwb_nfilled;
 	memcpy(lr_buf, lr, reclen);
 	lrb = (lr_t *)lr_buf;		/* Like lr, but inside lwb. */
 	lrwb = (lr_write_t *)lrb;	/* Like lrw, but inside lwb. */
 
 	ZIL_STAT_BUMP(zilog, zil_itx_count);
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
 	 */
 	if (lr->lrc_txtype == TX_WRITE) {
 		if (itx->itx_wr_state == WR_COPIED) {
 			/* Only statistics are updated for WR_COPIED here. */
 			ZIL_STAT_BUMP(zilog, zil_itx_copied_count);
 			ZIL_STAT_INCR(zilog, zil_itx_copied_bytes,
 			    lrw->lr_length);
 		} else {
 			char *dbuf;
 			int error;
 
 			if (itx->itx_wr_state == WR_NEED_COPY) {
 				/* Data goes right after the record header. */
 				dbuf = lr_buf + reclen;
 				lrb->lrc_reclen += dlen;
 				ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count);
 				ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes,
 				    dlen);
 			} else {
 				ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT);
 				dbuf = NULL;
 				ZIL_STAT_BUMP(zilog, zil_itx_indirect_count);
 				ZIL_STAT_INCR(zilog, zil_itx_indirect_bytes,
 				    lrw->lr_length);
 				/* Create the child root zio on first use. */
 				if (lwb->lwb_child_zio == NULL) {
 					lwb->lwb_child_zio = zio_root(
 					    zilog->zl_spa, NULL, NULL,
 					    ZIO_FLAG_CANFAIL);
 				}
 			}
 
 			/*
 			 * The "lwb_child_zio" we pass in will become a child of
 			 * "lwb_write_zio", when one is created, so one will be
 			 * a parent of any zio's created by the "zl_get_data".
 			 * This way "lwb_write_zio" will first wait for children
 			 * block pointers before own writing, and then for their
 			 * writing completion before the vdev cache flushing.
 			 */
 			error = zilog->zl_get_data(itx->itx_private,
 			    itx->itx_gen, lrwb, dbuf, lwb,
 			    lwb->lwb_child_zio);
 			if (dbuf != NULL && error == 0) {
 				/* Zero any padding bytes in the last block. */
 				memset((char *)dbuf + lrwb->lr_length, 0,
 				    dlen - lrwb->lr_length);
 			}
 
 			/*
 			 * Typically, the only return values we should see from
 			 * ->zl_get_data() are 0, EIO, ENOENT, EEXIST or
 			 *  EALREADY. However, it is also possible to see other
 			 *  error values such as ENOSPC or EINVAL from
 			 *  dmu_read() -> dnode_hold() -> dnode_hold_impl() or
 			 *  ENXIO as well as a multitude of others from the
 			 *  block layer through dmu_buf_hold() -> dbuf_read()
 			 *  -> zio_wait(), as well as through dmu_read() ->
 			 *  dnode_hold() -> dnode_hold_impl() -> dbuf_read() ->
 			 *  zio_wait(). When these errors happen, we can assume
 			 *  that neither an immediate write nor an indirect
 			 *  write occurred, so we need to fall back to
 			 *  txg_wait_synced(). This is unusual, so we print to
 			 *  dmesg whenever one of these errors occurs.
 			 */
 			switch (error) {
 			case 0:
 				break;
 			default:
 				cmn_err(CE_WARN, "zil_lwb_commit() received "
 				    "unexpected error %d from ->zl_get_data()"
 				    ". Falling back to txg_wait_synced().",
 				    error);
 				zfs_fallthrough;
 			case EIO:
 				txg_wait_synced(zilog->zl_dmu_pool,
 				    lr->lrc_txg);
 				zfs_fallthrough;
 			case ENOENT:
 				zfs_fallthrough;
 			case EEXIST:
 				zfs_fallthrough;
 			case EALREADY:
 				/* Every error path leaves lwb_nfilled as is. */
 				return;
 			}
 		}
 	}
 
 	/* Account for the record (and any copied data) just placed. */
 	lwb->lwb_nfilled += reclen + dlen;
 	ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused);
 	ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t)));
 }
 
 itx_t *
 zil_itx_create(uint64_t txtype, size_t olrsize)
 {
 	size_t itxsize, lrsize;
 	itx_t *itx;
 
 	ASSERT3U(olrsize, >=, sizeof (lr_t));
 	lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t);
 	ASSERT3U(lrsize, >=, olrsize);
 	itxsize = offsetof(itx_t, itx_lr) + lrsize;
 
 	itx = zio_data_buf_alloc(itxsize);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	memset((char *)&itx->itx_lr + olrsize, 0, lrsize - olrsize);
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 	itx->itx_callback = NULL;
 	itx->itx_callback_data = NULL;
 	itx->itx_size = itxsize;
 
 	return (itx);
 }
 
 /*
  * Return a newly allocated byte-for-byte copy of "oitx", except that the
  * copy carries no completion callback.
  */
 static itx_t *
 zil_itx_clone(itx_t *oitx)
 {
 	itx_t *copy;
 
 	ASSERT3U(oitx->itx_size, >=, sizeof (itx_t));
 	ASSERT3U(oitx->itx_size, ==,
 	    offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen);
 
 	copy = zio_data_buf_alloc(oitx->itx_size);
 	memcpy(copy, oitx, oitx->itx_size);
 
 	/* The callback and its argument stay with the original only. */
 	copy->itx_callback = NULL;
 	copy->itx_callback_data = NULL;
 
 	return (copy);
 }
 
 /*
  * Invoke the itx's callback, if one is set, and then free the itx.
  * TX_COMMIT itxs are asserted to have no callback.
  */
 void
 zil_itx_destroy(itx_t *itx)
 {
 	/* Sanity-check that itx_size matches the embedded record length. */
 	ASSERT3U(itx->itx_size, >=, sizeof (itx_t));
 	ASSERT3U(itx->itx_lr.lrc_reclen, ==,
 	    itx->itx_size - offsetof(itx_t, itx_lr));
 	IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL);
 	IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 
 	/* The callback runs before the memory is released. */
 	if (itx->itx_callback != NULL)
 		itx->itx_callback(itx->itx_callback_data);
 
 	zio_data_buf_free(itx, itx->itx_size);
 }
 
 /*
  * Destroy every itx held by a detached itxs_t (both the sync list and
  * all per-object async lists), then free the itxs_t itself.  The itxs_t
  * has already been detached, so no locks are needed.
  */
 static void
 zil_itxg_clean(void *arg)
 {
 	itxs_t *itxs = arg;
 	avl_tree_t *async_tree = &itxs->i_async_tree;
 	itx_async_node_t *node;
 	void *cookie = NULL;
 	itx_t *itx;
 
 	while ((itx = list_remove_head(&itxs->i_sync_list)) != NULL) {
 		/*
 		 * Commit itxs are normally not found here: they get
 		 * committed to an lwb via zil_lwb_assign(), and free'd
 		 * in that function.  They can still show up because of
 		 * the following race:
 		 *
 		 *	- a thread calls zil_commit() which assigns the
 		 *	  commit itx to a per-txg i_sync_list
 		 *	- zil_itxg_clean() is called (e.g. via spa_sync())
 		 *	  while the waiter is still on the i_sync_list
 		 *
 		 * Nothing prevents the txg from syncing while the waiter
 		 * is on the i_sync_list.  Usually spa_sync() is slower
 		 * than zil_commit(), so this doesn't happen, but if
 		 * zil_commit() calls txg_wait_synced() (e.g. because
 		 * zil_create() or zil_commit_writer_stall() is called)
 		 * we will hit this case.
 		 */
 		if (itx->itx_lr.lrc_txtype == TX_COMMIT)
 			zil_commit_waiter_skip(itx->itx_private);
 
 		zil_itx_destroy(itx);
 	}
 
 	while ((node = avl_destroy_nodes(async_tree, &cookie)) != NULL) {
 		while ((itx = list_remove_head(&node->ia_list)) != NULL) {
 			/* commit itxs should never be on the async lists. */
 			ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 			zil_itx_destroy(itx);
 		}
 		list_destroy(&node->ia_list);
 		kmem_free(node, sizeof (itx_async_node_t));
 	}
 	avl_destroy(async_tree);
 
 	kmem_free(itxs, sizeof (itxs_t));
 }
 
 static int
 zil_aitx_compare(const void *x1, const void *x2)
 {
 	const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid;
 	const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid;
 
 	return (TREE_CMP(o1, o2));
 }
 
 /*
  * Remove all async itx with the given oid.
  *
  * The matching per-object lists are spliced onto a private list while
  * each itxg lock is held; the itxs are destroyed only after the locks
  * have been dropped.
  */
 void
 zil_remove_async(zilog_t *zilog, uint64_t oid)
 {
 	itx_async_node_t key;
 	avl_index_t where;
 	uint64_t first_txg, txg;
 	list_t doomed;
 	itx_t *itx;
 
 	ASSERT(oid != 0);
 	list_create(&doomed, sizeof (itx_t), offsetof(itx_t, itx_node));
 
 	/* ziltest support */
 	first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
 	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	key.ia_foid = oid;
 
 	for (txg = first_txg; txg < (first_txg + TXG_CONCURRENT_STATES);
 	    txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			/*
 			 * Locate the object node and append its list.
 			 */
 			itx_async_node_t *node = avl_find(
 			    &itxg->itxg_itxs->i_async_tree, &key, &where);
 
 			if (node != NULL)
 				list_move_tail(&doomed, &node->ia_list);
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 
 	for (;;) {
 		itx = list_remove_head(&doomed);
 		if (itx == NULL)
 			break;
 		/* commit itxs should never be on the async lists. */
 		ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT);
 		zil_itx_destroy(itx);
 	}
 	list_destroy(&doomed);
 }
 
 /*
  * Queue an itx on the in-memory per-txg structures of the zilog.
  *
  * Synchronous itxs are appended to the txg's sync list; asynchronous
  * itxs are appended to a per-object list kept in the txg's AVL tree,
  * keyed by the object number taken from the record's lr_foid.  The
  * record's lrc_txg is set to the txg of "tx" and the zilog is dirtied
  * in that txg.
  *
  * If the itxg slot still holds itxs of another txg, they are detached
  * under the lock and freed after the lock is dropped.
  */
 void
 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx)
 {
 	uint64_t txg;
 	itxg_t *itxg;
 	itxs_t *itxs, *clean = NULL;
 
 	/*
 	 * Ensure the data of a renamed file is committed before the rename.
 	 */
 	if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME)
 		zil_async_to_sync(zilog, itx->itx_oid);
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX)
 		txg = ZILTEST_TXG;
 	else
 		txg = dmu_tx_get_txg(tx);
 
 	itxg = &zilog->zl_itxg[txg & TXG_MASK];
 	mutex_enter(&itxg->itxg_lock);
 	itxs = itxg->itxg_itxs;
 	if (itxg->itxg_txg != txg) {
 		if (itxs != NULL) {
 			/*
 			 * The zil_clean callback hasn't got around to cleaning
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 */
 			zfs_dbgmsg("zil_itx_assign: missed itx cleanup for "
 			    "txg %llu", (u_longlong_t)itxg->itxg_txg);
 			clean = itxg->itxg_itxs;
 		}
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t),
 		    KM_SLEEP);
 
 		list_create(&itxs->i_sync_list, sizeof (itx_t),
 		    offsetof(itx_t, itx_node));
 		avl_create(&itxs->i_async_tree, zil_aitx_compare,
 		    sizeof (itx_async_node_t),
 		    offsetof(itx_async_node_t, ia_node));
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid =
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid);
 		itx_async_node_t *ian, ian_search;
 		avl_index_t where;
 
 		/*
 		 * zil_aitx_compare() treats the search key as an
 		 * itx_async_node_t, so search with a real node rather
 		 * than a bare object number; this avoids depending on
 		 * where ia_foid sits inside the structure.
 		 */
 		ian_search.ia_foid = foid;
 		ian = avl_find(t, &ian_search, &where);
 		if (ian == NULL) {
 			/* First async itx for this object in this txg. */
 			ian = kmem_alloc(sizeof (itx_async_node_t),
 			    KM_SLEEP);
 			list_create(&ian->ia_list, sizeof (itx_t),
 			    offsetof(itx_t, itx_node));
 			ian->ia_foid = foid;
 			avl_insert(t, ian, where);
 		}
 		list_insert_tail(&ian->ia_list, itx);
 	}
 
 	itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * We don't want to dirty the ZIL using ZILTEST_TXG, because
 	 * zil_clean() will never be called using ZILTEST_TXG. Thus, we
 	 * need to be careful to always dirty the ZIL using the "real"
 	 * TXG (not itxg_txg) even when the SPA is frozen.
 	 */
 	zilog_dirty(zilog, dmu_tx_get_txg(tx));
 	mutex_exit(&itxg->itxg_lock);
 
 	/* Release the old itxs now we've dropped the lock */
 	if (clean != NULL)
 		zil_itxg_clean(clean);
 }
 
 /*
  * If there are any in-memory intent log transactions which have now been
  * synced then start up a taskq to free them. We should only do this after
  * we have written out the uberblocks (i.e. txg has been committed) so that
  * we don't inadvertently clean out in-memory log records that would be
  * required by zil_commit().
  */
 void
 zil_clean(zilog_t *zilog, uint64_t synced_txg)
 {
 	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
 	itxs_t *stale = NULL;
 
 	ASSERT3U(synced_txg, <, ZILTEST_TXG);
 
 	/* Detach the itxs under the lock, unless the slot is empty or ziltest. */
 	mutex_enter(&itxg->itxg_lock);
 	if (itxg->itxg_itxs != NULL && itxg->itxg_txg != ZILTEST_TXG) {
 		ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 		ASSERT3U(itxg->itxg_txg, !=, 0);
 		stale = itxg->itxg_itxs;
 		itxg->itxg_itxs = NULL;
 		itxg->itxg_txg = 0;
 	}
 	mutex_exit(&itxg->itxg_lock);
 
 	if (stale == NULL)
 		return;
 
 	/*
 	 * Preferably start a task queue to free up the old itxs but
 	 * if taskq_dispatch can't allocate resources to do that then
 	 * free it in-line. This should be rare. Note, using TQ_SLEEP
 	 * created a bad performance problem.
 	 */
 	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
 	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
 	if (taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
 	    zil_itxg_clean, stale, TQ_NOSLEEP) == TASKQID_INVALID)
 		zil_itxg_clean(stale);
 }
 
 /*
  * This function will traverse the queue of itxs that need to be
  * committed, and move them onto the ZIL's zl_itx_commit_list.
  *
  * For every itx moved, the zilog's zl_cur_size, zl_cur_left and
  * zl_cur_max accounting is updated.
  *
  * Returns 0, or, if the ZIL was found suspended, the highest txg whose
  * sync list was non-empty and therefore left in place; the comment in
  * the loop asks the caller to txg_wait_synced() for it.
  *
  * The caller must hold zl_issuer_lock.
  */
 static uint64_t
 zil_get_commit_list(zilog_t *zilog)
 {
 	uint64_t otxg, txg, wtxg = 0;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
 	else
 		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing. That's okay since we'll
 	 * only commit things in the future.
 	 */
 	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		/* Skip slots that do not currently hold this txg. */
 		if (itxg->itxg_txg != txg) {
 			mutex_exit(&itxg->itxg_lock);
 			continue;
 		}
 
 		/*
 		 * If we're adding itx records to the zl_itx_commit_list,
 		 * then the zil better be dirty in this "txg". We can assert
 		 * that here since we're holding the itxg_lock which will
 		 * prevent spa_sync from cleaning it. Once we add the itxs
 		 * to the zl_itx_commit_list we must commit it to disk even
 		 * if it's unnecessary (i.e. the txg was synced).
 		 */
 		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 		list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
 		itx_t *itx = NULL;
 		if (unlikely(zilog->zl_suspend > 0)) {
 			/*
 			 * ZIL was just suspended, but we lost the race.
 			 * Allow all earlier itxs to be committed, but ask
 			 * caller to do txg_wait_synced(txg) for any new.
 			 */
 			if (!list_is_empty(sync_list))
 				wtxg = MAX(wtxg, txg);
 		} else {
 			/*
 			 * Remember the first itx being moved so that the
 			 * accounting loop below starts from it.
 			 */
 			itx = list_head(sync_list);
 			list_move_tail(commit_list, sync_list);
 		}
 
 		mutex_exit(&itxg->itxg_lock);
 
 		/* Account for the itxs just appended to the commit list. */
 		while (itx != NULL) {
 			uint64_t s = zil_itx_full_size(itx);
 			zilog->zl_cur_size += s;
 			zilog->zl_cur_left += s;
 			s = zil_itx_record_size(itx);
 			zilog->zl_cur_max = MAX(zilog->zl_cur_max, s);
 			itx = list_next(commit_list, itx);
 		}
 	}
 	return (wtxg);
 }
 
 /*
  * Move the async itxs for a specified object to commit into sync lists.
  * A foid of 0 moves the async itxs of every object.
  */
 void
 zil_async_to_sync(zilog_t *zilog, uint64_t foid)
 {
 	itx_async_node_t key;
 	itx_async_node_t *node;
 	avl_index_t where;
 	uint64_t first_txg, txg;
 
 	/* ziltest support */
 	first_txg = (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) ?
 	    ZILTEST_TXG : spa_last_synced_txg(zilog->zl_spa) + 1;
 
 	/*
 	 * This is inherently racy, since there is nothing to prevent
 	 * the last synced txg from changing.
 	 */
 	for (txg = first_txg; txg < (first_txg + TXG_CONCURRENT_STATES);
 	    txg++) {
 		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
 
 		mutex_enter(&itxg->itxg_lock);
 		if (itxg->itxg_txg == txg) {
 			avl_tree_t *tree = &itxg->itxg_itxs->i_async_tree;
 			list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
 
 			/*
 			 * With a foid, find that one node and append its
 			 * list.  Without one, walk the whole tree and
 			 * append every list to the sync list.  Appending
 			 * at the end rather than the beginning ensures
 			 * the create has happened.
 			 */
 			if (foid == 0) {
 				void *cookie = NULL;
 
 				for (;;) {
 					node = avl_destroy_nodes(tree,
 					    &cookie);
 					if (node == NULL)
 						break;
 					list_move_tail(sync_list,
 					    &node->ia_list);
 					list_destroy(&node->ia_list);
 					kmem_free(node,
 					    sizeof (itx_async_node_t));
 				}
 			} else {
 				key.ia_foid = foid;
 				node = avl_find(tree, &key, &where);
 				if (node != NULL) {
 					list_move_tail(sync_list,
 					    &node->ia_list);
 				}
 			}
 		}
 		mutex_exit(&itxg->itxg_lock);
 	}
 }
 
 /*
  * This function will prune commit itxs that are at the head of the
  * commit list (it won't prune past the first non-commit itx), and
  * either: a) attach them to the last lwb that's still pending
  * completion, or b) skip them altogether.
  *
  * This is used as a performance optimization to prevent commit itxs
  * from generating new lwbs when it's unnecessary to do so.
  *
  * The caller must hold zl_issuer_lock; zl_lock is taken here around
  * each waiter update.
  */
 static void
 zil_prune_commit_list(zilog_t *zilog)
 {
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		/* Stop at the first itx that is not a commit itx. */
 		if (lrc->lrc_txtype != TX_COMMIT)
 			break;
 
 		mutex_enter(&zilog->zl_lock);
 
 		lwb_t *last_lwb = zilog->zl_last_lwb_opened;
 		if (last_lwb == NULL ||
 		    last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
 			/*
 			 * All of the itxs this waiter was waiting on
 			 * must have already completed (or there were
 			 * never any itx's for it to wait on), so it's
 			 * safe to skip this waiter and mark it done.
 			 */
 			zil_commit_waiter_skip(itx->itx_private);
 		} else {
 			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
 		}
 
 		mutex_exit(&zilog->zl_lock);
 
 		/* The waiter has been handled; the itx itself is done. */
 		list_remove(&zilog->zl_itx_commit_list, itx);
 		zil_itx_destroy(itx);
 	}
 
 	/* Either the list is empty or its head is a non-commit itx. */
 	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
 }
 
 /*
  * Stall the ZIL write pipeline by waiting for the pool to sync; see the
  * comment in the body for the reasoning.  The caller must hold
  * zl_issuer_lock.
  */
 static void
 zil_commit_writer_stall(zilog_t *zilog)
 {
 	/*
 	 * When zio_alloc_zil() fails to allocate the next lwb block on
 	 * disk, we must call txg_wait_synced() to ensure all of the
 	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
 	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
 	 * to zil_process_commit_list()) will have to call zil_create(),
 	 * and start a new ZIL chain.
 	 *
 	 * Since zio_alloc_zil() failed, the lwb that was previously
 	 * issued does not have a pointer to the "next" lwb on disk.
 	 * Thus, if another ZIL writer thread was to allocate the "next"
 	 * on-disk lwb, that block could be leaked in the event of a
 	 * crash (because the previous lwb on-disk would not point to
 	 * it).
 	 *
 	 * We must hold the zilog's zl_issuer_lock while we do this, to
 	 * ensure no new threads enter zil_process_commit_list() until
 	 * all lwb's in the zl_lwb_list have been synced and freed
 	 * (which is achieved via the txg_wait_synced() call).
 	 */
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 }
 
 /*
  * Close out the current burst accounting.  Does nothing while the
  * commit list still has itxs on it or when zl_cur_size is zero.
  * Otherwise it decrements zl_parallel (if non-zero), stores the result
  * of zil_lwb_plan() for the burst in the next slot of the zl_prev_opt /
  * zl_prev_min rings, and resets the zl_cur_* counters.
  */
 static void
 zil_burst_done(zilog_t *zilog)
 {
 	/* The burst is not over while itxs are still queued for commit. */
 	if (!list_is_empty(&zilog->zl_itx_commit_list))
 		return;
 	/* Nothing was accumulated, so there is nothing to record. */
 	if (zilog->zl_cur_size == 0)
 		return;
 
 	if (zilog->zl_parallel != 0)
 		zilog->zl_parallel -= 1;
 
 	/* Advance the ring index; the mask wraps it at ZIL_BURSTS. */
 	const uint_t slot = (zilog->zl_prev_rotor + 1) & (ZIL_BURSTS - 1);
 	zilog->zl_prev_rotor = slot;
 	zilog->zl_prev_opt[slot] = zil_lwb_plan(zilog, zilog->zl_cur_size,
 	    &zilog->zl_prev_min[slot]);
 
 	zilog->zl_cur_left = 0;
 	zilog->zl_cur_max = 0;
 	zilog->zl_cur_size = 0;
 }
 
 /*
  * This function will traverse the commit list, creating new lwbs as
  * needed, and committing the itxs from the commit list to these newly
  * created lwbs. Additionally, as a new lwb is created, the previous
  * lwb will be issued to the zio layer to be written to disk.
  *
  * "zcw" is the commit waiter of the calling thread; processing stops
  * early once that waiter is done or linked to an lwb other than the
  * current one.  "ilwbs" is a list of lwbs to issue: it is handed to
  * zil_lwb_assign() and a closed lwb may be appended to it here.
  *
  * The caller must hold zl_issuer_lock.
  */
 static void
 zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 {
 	spa_t *spa = zilog->zl_spa;
 	list_t nolwb_itxs;
 	list_t nolwb_waiters;
 	lwb_t *lwb, *plwb;
 	itx_t *itx;
 
 	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
 
 	/*
 	 * Return if there's nothing to commit before we dirty the fs by
 	 * calling zil_create().
 	 */
 	if (list_is_empty(&zilog->zl_itx_commit_list))
 		return;
 
 	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
 	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb == NULL) {
 		lwb = zil_create(zilog);
 	} else {
 		/*
 		 * Activate SPA_FEATURE_ZILSAXATTR for the cases where ZIL will
 		 * have already been created (zl_lwb_list not empty).
 		 */
 		zil_commit_activate_saxattr_feature(zilog);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
 
 		/*
 		 * If the lwb is still opened, it means the workload is really
 		 * multi-threaded and we won the chance of write aggregation.
 		 * If it is not opened yet, but previous lwb is still not
 		 * flushed, it still means the workload is multi-threaded, but
 		 * there was too much time between the commits to aggregate, so
 		 * we try aggregation next times, but without too much hopes.
 		 */
 		if (lwb->lwb_state == LWB_STATE_OPENED) {
 			zilog->zl_parallel = ZIL_BURSTS;
 		} else if ((plwb = list_prev(&zilog->zl_lwb_list, lwb))
 		    != NULL && plwb->lwb_state != LWB_STATE_FLUSH_DONE) {
 			zilog->zl_parallel = MAX(zilog->zl_parallel,
 			    ZIL_BURSTS / 2);
 		}
 	}
 
 	while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) {
 		lr_t *lrc = &itx->itx_lr;
 		uint64_t txg = lrc->lrc_txg;
 
 		ASSERT3U(txg, !=, 0);
 
 		if (lrc->lrc_txtype == TX_COMMIT) {
 			DTRACE_PROBE2(zil__process__commit__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		} else {
 			DTRACE_PROBE2(zil__process__normal__itx,
 			    zilog_t *, zilog, itx_t *, itx);
 		}
 
 		boolean_t synced = txg <= spa_last_synced_txg(spa);
 		boolean_t frozen = txg > spa_freeze_txg(spa);
 
 		/*
 		 * If the txg of this itx has already been synced out, then
 		 * we don't need to commit this itx to an lwb. This is
 		 * because the data of this itx will have already been
 		 * written to the main pool. This is inherently racy, and
 		 * it's still ok to commit an itx whose txg has already
 		 * been synced; this will result in a write that's
 		 * unnecessary, but will do no harm.
 		 *
 		 * With that said, we always want to commit TX_COMMIT itxs
 		 * to an lwb, regardless of whether or not that itx's txg
 		 * has been synced out. We do this to ensure any OPENED lwb
 		 * will always have at least one zil_commit_waiter_t linked
 		 * to the lwb.
 		 *
 		 * As a counter-example, if we skipped TX_COMMIT itx's
 		 * whose txg had already been synced, the following
 		 * situation could occur if we happened to be racing with
 		 * spa_sync:
 		 *
 		 * 1. We commit a non-TX_COMMIT itx to an lwb, where the
 		 *    itx's txg is 10 and the last synced txg is 9.
 		 * 2. spa_sync finishes syncing out txg 10.
 		 * 3. We move to the next itx in the list, it's a TX_COMMIT
 		 *    whose txg is 10, so we skip it rather than committing
 		 *    it to the lwb used in (1).
 		 *
 		 * If the itx that is skipped in (3) is the last TX_COMMIT
 		 * itx in the commit list, then it's possible for the lwb
 		 * used in (1) to remain in the OPENED state indefinitely.
 		 *
 		 * To prevent the above scenario from occurring, ensuring
 		 * that once an lwb is OPENED it will transition to ISSUED
 		 * and eventually DONE, we always commit TX_COMMIT itx's to
 		 * an lwb here, even if that itx's txg has already been
 		 * synced.
 		 *
 		 * Finally, if the pool is frozen, we _always_ commit the
 		 * itx.  The point of freezing the pool is to prevent data
 		 * from being written to the main pool via spa_sync, and
 		 * instead rely solely on the ZIL to persistently store the
 		 * data; i.e.  when the pool is frozen, the last synced txg
 		 * value can't be trusted.
 		 */
 		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
 			if (lwb != NULL) {
 				lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs);
 				if (lwb == NULL) {
 					list_insert_tail(&nolwb_itxs, itx);
 				} else if ((zcw->zcw_lwb != NULL &&
 				    zcw->zcw_lwb != lwb) || zcw->zcw_done) {
 					/*
 					 * Our lwb is done, leave the rest of
 					 * itx list to somebody else who care.
 					 */
 					zilog->zl_parallel = ZIL_BURSTS;
 					zilog->zl_cur_left -=
 					    zil_itx_full_size(itx);
 					break;
 				}
 			} else {
 				if (lrc->lrc_txtype == TX_COMMIT) {
 					zil_commit_waiter_link_nolwb(
 					    itx->itx_private, &nolwb_waiters);
 				}
 				list_insert_tail(&nolwb_itxs, itx);
 			}
 			zilog->zl_cur_left -= zil_itx_full_size(itx);
 		} else {
 			/* Already synced: account for the itx and drop it. */
 			ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
 			zilog->zl_cur_left -= zil_itx_full_size(itx);
 			zil_itx_destroy(itx);
 		}
 	}
 
 	if (lwb == NULL) {
 		/*
 		 * This indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this happens, we must stall
 		 * the ZIL write pipeline; see the comment within
 		 * zil_commit_writer_stall() for more details.
 		 */
 		while ((lwb = list_remove_head(ilwbs)) != NULL)
 			zil_lwb_write_issue(zilog, lwb);
 		zil_commit_writer_stall(zilog);
 
 		/*
 		 * Additionally, we have to signal and mark the "nolwb"
 		 * waiters as "done" here, since without an lwb, we
 		 * can't do this via zil_lwb_flush_vdevs_done() like
 		 * normal.
 		 */
 		zil_commit_waiter_t *zcw;
 		while ((zcw = list_remove_head(&nolwb_waiters)) != NULL)
 			zil_commit_waiter_skip(zcw);
 
 		/*
 		 * And finally, we have to destroy the itx's that
 		 * couldn't be committed to an lwb; this will also call
 		 * the itx's callback if one exists for the itx.
 		 */
 		while ((itx = list_remove_head(&nolwb_itxs)) != NULL)
 			zil_itx_destroy(itx);
 	} else {
 		ASSERT(list_is_empty(&nolwb_waiters));
 		ASSERT3P(lwb, !=, NULL);
 		ASSERT(lwb->lwb_state == LWB_STATE_NEW ||
 		    lwb->lwb_state == LWB_STATE_OPENED);
 
 		/*
 		 * At this point, the ZIL block pointed at by the "lwb"
 		 * variable is in "new" or "opened" state.
 		 *
 		 * If it's "new", then no itxs have been committed to it, so
 		 * there's no point in issuing its zio (i.e. it's "empty").
 		 *
 		 * If it's "opened", then it contains one or more itxs that
 		 * eventually need to be committed to stable storage. In
 		 * this case we intentionally do not issue the lwb's zio
 		 * to disk yet, and instead rely on one of the following
 		 * two mechanisms for issuing the zio:
 		 *
 		 * 1. Ideally, there will be more ZIL activity occurring on
 		 * the system, such that this function will be immediately
 		 * called again by different thread and this lwb will be
 		 * closed by zil_lwb_assign().  This way, the lwb will be
 		 * "full" when it is issued to disk, and we'll make use of
 		 * the lwb's size the best we can.
 		 *
 		 * 2. If there isn't sufficient ZIL activity occurring on
 		 * the system, zil_commit_waiter() will close it and issue
 		 * the zio.  If this occurs, the lwb is not guaranteed
 		 * to be "full" by the time its zio is issued, and means
 		 * the size of the lwb was "too large" given the amount
 		 * of ZIL activity occurring on the system at that time.
 		 *
 		 * We do this for a couple of reasons:
 		 *
 		 * 1. To try and reduce the number of IOPs needed to
 		 * write the same number of itxs. If an lwb has space
 		 * available in its buffer for more itxs, and more itxs
 		 * will be committed relatively soon (relative to the
 		 * latency of performing a write), then it's beneficial
 		 * to wait for these "next" itxs. This way, more itxs
 		 * can be committed to stable storage with fewer writes.
 		 *
 		 * 2. To try and use the largest lwb block size that the
 		 * incoming rate of itxs can support. Again, this is to
 		 * try and pack as many itxs into as few lwbs as
 		 * possible, without significantly impacting the latency
 		 * of each individual itx.
 		 */
 		if (lwb->lwb_state == LWB_STATE_OPENED && !zilog->zl_parallel) {
 			zil_burst_done(zilog);
 			list_insert_tail(ilwbs, lwb);
 			lwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
 			if (lwb == NULL) {
 				while ((lwb = list_remove_head(ilwbs)) != NULL)
 					zil_lwb_write_issue(zilog, lwb);
 				zil_commit_writer_stall(zilog);
 			}
 		}
 	}
 }
 
 /*
  * This function is responsible for ensuring the passed in commit waiter
  * (and associated commit itx) is committed to an lwb. If the waiter is
  * not already committed to an lwb, all itxs in the zilog's queue of
  * itxs will be processed. The assumption is the passed in waiter's
  * commit itx will be found in the queue just like the other non-commit
  * itxs, such that when the entire queue is processed, the waiter will
  * have been committed to an lwb.
  *
  * The lwb associated with the passed in waiter is not guaranteed to
  * have been issued by the time this function completes. If the lwb is
  * not issued, we rely on future calls to zil_commit_writer() to issue
  * the lwb, or the timeout mechanism found in zil_commit_waiter().
  *
  * Returns the value produced by zil_get_commit_list(), or 0 when the
  * waiter had already been processed and the queue was not touched.
  */
 static uint64_t
 zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	list_t ilwbs;
 	lwb_t *lwb;
 	uint64_t wtxg = 0;
 
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node));
 	mutex_enter(&zilog->zl_issuer_lock);
 
 	if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
 		/*
 		 * It's possible that, while we were waiting to acquire
 		 * the "zl_issuer_lock", another thread committed this
 		 * waiter to an lwb. If that occurs, we bail out early,
 		 * without processing any of the zilog's queue of itxs.
 		 *
 		 * On certain workloads and system configurations, the
 		 * "zl_issuer_lock" can become highly contended. In an
 		 * attempt to reduce this contention, we immediately drop
 		 * the lock if the waiter has already been processed.
 		 *
 		 * We've measured this optimization to reduce CPU spent
 		 * contending on this lock by up to 5%, using a system
 		 * with 32 CPUs, low latency storage (~50 usec writes),
 		 * and 1024 threads performing sync writes.
 		 */
 		goto out;
 	}
 
 	ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
 
 	wtxg = zil_get_commit_list(zilog);
 	zil_prune_commit_list(zilog);
 	zil_process_commit_list(zilog, zcw, &ilwbs);
 
 out:
 	mutex_exit(&zilog->zl_issuer_lock);
 	/* Issue the collected lwbs only after dropping zl_issuer_lock. */
 	while ((lwb = list_remove_head(&ilwbs)) != NULL)
 		zil_lwb_write_issue(zilog, lwb);
 	list_destroy(&ilwbs);
 	return (wtxg);
 }
 
 /*
  * Close and issue the lwb that the commit waiter is linked to, if that
  * lwb is still in the OPENED state.
  *
  * Called with zcw_lock held and zl_issuer_lock not held.  zcw_lock is
  * dropped and re-taken inside; it is held again on every return path.
  */
 static void
 zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
 
 	lwb_t *lwb = zcw->zcw_lwb;
 	ASSERT3P(lwb, !=, NULL);
 	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_NEW);
 
 	/*
 	 * If the lwb has already been issued by another thread, we can
 	 * immediately return since there's no work to be done (the
 	 * point of this function is to issue the lwb). Additionally, we
 	 * do this prior to acquiring the zl_issuer_lock, to avoid
 	 * acquiring it when it's not necessary to do so.
 	 */
 	if (lwb->lwb_state != LWB_STATE_OPENED)
 		return;
 
 	/*
 	 * In order to call zil_lwb_write_close() we must hold the
 	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
 	 * since we're already holding the commit waiter's "zcw_lock",
 	 * and those two locks are acquired in the opposite order
 	 * elsewhere.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 	mutex_enter(&zilog->zl_issuer_lock);
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * Since we just dropped and re-acquired the commit waiter's
 	 * lock, we have to re-check to see if the waiter was marked
 	 * "done" during that process. If the waiter was marked "done",
 	 * the "lwb" pointer is no longer valid (it can be free'd after
 	 * the waiter is marked "done"), so without this check we could
 	 * wind up with a use-after-free error below.
 	 */
 	if (zcw->zcw_done) {
 		mutex_exit(&zilog->zl_issuer_lock);
 		return;
 	}
 
 	ASSERT3P(lwb, ==, zcw->zcw_lwb);
 
 	/*
 	 * We've already checked this above, but since we hadn't acquired
 	 * the zilog's zl_issuer_lock, we have to perform this check a
 	 * second time while holding the lock.
 	 *
 	 * We don't need to hold the zl_lock since the lwb cannot transition
 	 * from OPENED to CLOSED while we hold the zl_issuer_lock. The lwb
 	 * _can_ transition from CLOSED to DONE, but it's OK to race with
 	 * that transition since we treat the lwb the same, whether it's in
 	 * the CLOSED, ISSUED or DONE states.
 	 *
 	 * The important thing, is we treat the lwb differently depending on
 	 * if it's OPENED or CLOSED, and block any other threads that might
 	 * attempt to close/issue this lwb. For that reason we hold the
 	 * zl_issuer_lock when checking the lwb_state; we must not call
 	 * zil_lwb_write_close() if the lwb had already been closed/issued.
 	 *
 	 * See the comment above the lwb_state_t structure definition for
 	 * more details on the lwb states, and locking requirements.
 	 */
 	if (lwb->lwb_state != LWB_STATE_OPENED) {
 		mutex_exit(&zilog->zl_issuer_lock);
 		return;
 	}
 
 	/*
 	 * We do not need zcw_lock once we hold zl_issuer_lock and know lwb
 	 * is still open.  But we have to drop it to avoid a deadlock in case
 	 * callback of zio issued by zil_lwb_write_issue() try to get it,
 	 * while zil_lwb_write_issue() is blocked on attempt to issue next
 	 * lwb it found in LWB_STATE_READY state.
 	 */
 	mutex_exit(&zcw->zcw_lock);
 
 	/*
 	 * As described in the comments above zil_commit_waiter() and
 	 * zil_process_commit_list(), we need to issue this lwb's zio
 	 * since we've reached the commit waiter's timeout and it still
 	 * hasn't been issued.
 	 */
 	zil_burst_done(zilog);
 	lwb_t *nlwb = zil_lwb_write_close(zilog, lwb, LWB_STATE_NEW);
 
 	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_CLOSED);
 
 	if (nlwb == NULL) {
 		/*
 		 * When zil_lwb_write_close() returns NULL, this
 		 * indicates zio_alloc_zil() failed to allocate the
 		 * "next" lwb on-disk. When this occurs, the ZIL write
 		 * pipeline must be stalled; see the comment within the
 		 * zil_commit_writer_stall() function for more details.
 		 */
 		zil_lwb_write_issue(zilog, lwb);
 		zil_commit_writer_stall(zilog);
 		mutex_exit(&zilog->zl_issuer_lock);
 	} else {
 		/* Normal case: issue after dropping zl_issuer_lock. */
 		mutex_exit(&zilog->zl_issuer_lock);
 		zil_lwb_write_issue(zilog, lwb);
 	}
 	/* Re-take zcw_lock so it is held on return, as it was on entry. */
 	mutex_enter(&zcw->zcw_lock);
 }
 
 /*
  * This function is responsible for performing the following two tasks:
  *
  * 1. its primary responsibility is to block until the given "commit
  *    waiter" is considered "done".
  *
  * 2. its secondary responsibility is to issue the zio for the lwb that
  *    the given "commit waiter" is waiting on, if this function has
  *    waited "long enough" and the lwb is still in the "open" state.
  *
  * Given a sufficient amount of itxs being generated and written using
  * the ZIL, the lwb's zio will be issued via the zil_lwb_assign()
  * function. If this does not occur, this secondary responsibility will
  * ensure the lwb is issued even if there is no other synchronous
  * activity on the system.
  *
  * For more details, see zil_process_commit_list(); more specifically,
  * the comment at the bottom of that function.
  */
 static void
 zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	/* Neither ZIL lock may be held on entry; the pool must be writeable. */
 	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
 	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
 	ASSERT(spa_writeable(zilog->zl_spa));
 
 	mutex_enter(&zcw->zcw_lock);
 
 	/*
 	 * The timeout is scaled based on the lwb latency to avoid
 	 * significantly impacting the latency of each individual itx.
 	 * For more details, see the comment at the bottom of the
 	 * zil_process_commit_list() function.
 	 *
 	 * The percentage is clamped to at least 1, and the wakeup time is
 	 * computed once, as an absolute time, before the loop starts.
 	 */
 	int pct = MAX(zfs_commit_timeout_pct, 1);
 	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
 	hrtime_t wakeup = gethrtime() + sleep;
 	boolean_t timedout = B_FALSE;
 
 	/* Loop, with zcw_lock held at the top of each pass, until done. */
 	while (!zcw->zcw_done) {
 		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
 
 		lwb_t *lwb = zcw->zcw_lwb;
 
 		/*
 		 * Usually, the waiter will have a non-NULL lwb field here,
 		 * but it's possible for it to be NULL as a result of
 		 * zil_commit() racing with spa_sync().
 		 *
 		 * When zil_clean() is called, it's possible for the itxg
 		 * list (which may be cleaned via a taskq) to contain
 		 * commit itxs. When this occurs, the commit waiters linked
 		 * off of these commit itxs will not be committed to an
 		 * lwb.  Additionally, these commit waiters will not be
 		 * marked done until zil_commit_waiter_skip() is called via
 		 * zil_itxg_clean().
 		 *
 		 * Thus, it's possible for this commit waiter (i.e. the
 		 * "zcw" variable) to be found in this "in between" state;
 		 * where its "zcw_lwb" field is NULL, and it hasn't yet
 		 * been skipped, so its "zcw_done" field is still B_FALSE.
 		 */
 		IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_NEW);
 
 		if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
 			/*
 			 * The timeout path below is expected to run at most
 			 * once; after it, the lwb is asserted to no longer
 			 * be in the OPENED state.
 			 */
 			ASSERT3B(timedout, ==, B_FALSE);
 
 			/*
 			 * If the lwb hasn't been issued yet, then we
 			 * need to wait with a timeout, in case this
 			 * function needs to issue the lwb after the
 			 * timeout is reached; responsibility (2) from
 			 * the comment above this function.
 			 */
 			int rc = cv_timedwait_hires(&zcw->zcw_cv,
 			    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
 			    CALLOUT_FLAG_ABSOLUTE);
 
 			/*
 			 * Only a return value of -1 with the waiter still
 			 * not done is handled as a timeout below; in every
 			 * other case go around the loop and re-check
 			 * zcw_done and the lwb state.
 			 */
 			if (rc != -1 || zcw->zcw_done)
 				continue;
 
 			timedout = B_TRUE;
 			zil_commit_waiter_timeout(zilog, zcw);
 
 			if (!zcw->zcw_done) {
 				/*
 				 * If the commit waiter has already been
 				 * marked "done", it's possible for the
 				 * waiter's lwb structure to have already
 				 * been freed.  Thus, we can only reliably
 				 * make these assertions if the waiter
 				 * isn't done.
 				 */
 				ASSERT3P(lwb, ==, zcw->zcw_lwb);
 				ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
 			}
 		} else {
 			/*
 			 * If the lwb isn't open, then it must have already
 			 * been issued. In that case, there's no need to
 			 * use a timeout when waiting for the lwb to
 			 * complete.
 			 *
 			 * Additionally, if the lwb is NULL, the waiter
 			 * will soon be signaled and marked done via
 			 * zil_clean() and zil_itxg_clean(), so no timeout
 			 * is required.
 			 */
 
 			IMPLY(lwb != NULL,
 			    lwb->lwb_state == LWB_STATE_CLOSED ||
 			    lwb->lwb_state == LWB_STATE_READY ||
 			    lwb->lwb_state == LWB_STATE_ISSUED ||
 			    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
 			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
 			cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
 		}
 	}
 
 	mutex_exit(&zcw->zcw_lock);
 }
 
 /*
  * Allocate a commit waiter from zil_zcw_cache and put it into its initial
  * state: not linked on any list, no lwb attached, not done, and no zio
  * error recorded.  The allocation uses KM_SLEEP.
  */
 static zil_commit_waiter_t *
 zil_alloc_commit_waiter(void)
 {
 	zil_commit_waiter_t *waiter;
 
 	waiter = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
 
 	/* Plain state first ... */
 	waiter->zcw_zio_error = 0;
 	waiter->zcw_done = B_FALSE;
 	waiter->zcw_lwb = NULL;
 	list_link_init(&waiter->zcw_node);
 
 	/* ... then the lock and condition variable used to wait on it. */
 	mutex_init(&waiter->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&waiter->zcw_cv, NULL, CV_DEFAULT, NULL);
 
 	return (waiter);
 }
 
 /*
  * Tear down a commit waiter and return it to zil_zcw_cache.
  *
  * The waiter must already be finished with: it may not be linked on a
  * list, it may not reference an lwb, and it must have been marked done.
  */
 static void
 zil_free_commit_waiter(zil_commit_waiter_t *zcw)
 {
 	ASSERT(!list_link_active(&zcw->zcw_node));
 	ASSERT3P(zcw->zcw_lwb, ==, NULL);
 	ASSERT3B(zcw->zcw_done, ==, B_TRUE);
 	/* Destroy in the reverse order of zil_alloc_commit_waiter()'s init. */
 	mutex_destroy(&zcw->zcw_lock);
 	cv_destroy(&zcw->zcw_cv);
 	kmem_cache_free(zil_zcw_cache, zcw);
 }
 
 /*
  * This function is used to create a TX_COMMIT itx and assign it. This
  * way, it will be linked into the ZIL's list of synchronous itxs, and
  * then later committed to an lwb (or skipped) when
  * zil_process_commit_list() is called.
  */
 static void
 zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
 {
 	dmu_tx_t *commit_tx;
 	itx_t *commit_itx;
 
 	commit_tx = dmu_tx_create(zilog->zl_os);
 
 	/*
 	 * This tx creates no new dirty data (and may even help clear
 	 * existing dirty data), so it should not be held up by the
 	 * dirty-data based delays; TXG_NOTHROTTLE bypasses them.
 	 */
 	VERIFY0(dmu_tx_assign(commit_tx, TXG_WAIT | TXG_NOTHROTTLE));
 
 	/* Build the commit itx and hang the waiter off of it. */
 	commit_itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
 	commit_itx->itx_private = zcw;
 	commit_itx->itx_sync = B_TRUE;
 
 	zil_itx_assign(zilog, commit_itx, commit_tx);
 	dmu_tx_commit(commit_tx);
 }
 
 /*
  * Commit ZFS Intent Log transactions (itxs) to stable storage.
  *
  * When writing ZIL transactions to the on-disk representation of the
  * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
  * itxs can be committed to a single lwb. Once a lwb is written and
  * committed to stable storage (i.e. the lwb is written, and vdevs have
  * been flushed), each itx that was committed to that lwb is also
  * considered to be committed to stable storage.
  *
  * When an itx is committed to an lwb, the log record (lr_t) contained
  * by the itx is copied into the lwb's zio buffer, and once this buffer
  * is written to disk, it becomes an on-disk ZIL block.
  *
  * As itxs are generated, they're inserted into the ZIL's queue of
  * uncommitted itxs. The semantics of zil_commit() are such that it will
  * block until all itxs that were in the queue when it was called, are
  * committed to stable storage.
  *
  * If "foid" is zero, this means all "synchronous" and "asynchronous"
  * itxs, for all objects in the dataset, will be committed to stable
  * storage prior to zil_commit() returning. If "foid" is non-zero, all
  * "synchronous" itxs for all objects, but only "asynchronous" itxs
  * that correspond to the foid passed in, will be committed to stable
  * storage prior to zil_commit() returning.
  *
  * Generally speaking, when zil_commit() is called, the consumer doesn't
  * actually care about _all_ of the uncommitted itxs. Instead, they're
  * simply trying to wait for a specific itx to be committed to disk,
  * but the interface(s) for interacting with the ZIL don't allow such
  * fine-grained communication. A better interface would allow a consumer
  * to create and assign an itx, and then pass a reference to this itx to
  * zil_commit(); such that zil_commit() would return as soon as that
  * specific itx was committed to disk (instead of waiting for _all_
  * itxs to be committed).
  *
  * When a thread calls zil_commit() a special "commit itx" will be
  * generated, along with a corresponding "waiter" for this commit itx.
  * zil_commit() will wait on this waiter's CV, such that when the waiter
  * is marked done, and signaled, zil_commit() will return.
  *
  * This commit itx is inserted into the queue of uncommitted itxs. This
  * provides an easy mechanism for determining which itxs were in the
  * queue prior to zil_commit() having been called, and which itxs were
  * added after zil_commit() was called.
  *
  * The commit itx is special; it doesn't have any on-disk representation.
  * When a commit itx is "committed" to an lwb, the waiter associated
  * with it is linked onto the lwb's list of waiters. Then, when that lwb
  * completes, each waiter on the lwb's list is marked done and signaled
  * -- allowing the thread waiting on the waiter to return from zil_commit().
  *
  * It's important to point out a few critical factors that allow us
  * to make use of the commit itxs, commit waiters, per-lwb lists of
  * commit waiters, and zio completion callbacks like we're doing:
  *
  *   1. The list of waiters for each lwb is traversed, and each commit
  *      waiter is marked "done" and signaled, in the zio completion
  *      callback of the lwb's zio[*].
  *
  *      * Actually, the waiters are signaled in the zio completion
  *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
  *        that are sent to the vdevs upon completion of the lwb zio.
  *
  *   2. When the itxs are inserted into the ZIL's queue of uncommitted
  *      itxs, the order in which they are inserted is preserved[*]; as
  *      itxs are added to the queue, they are added to the tail of
  *      in-memory linked lists.
  *
  *      When committing the itxs to lwbs (to be written to disk), they
  *      are committed in the same order in which the itxs were added to
  *      the uncommitted queue's linked list(s); i.e. the linked list of
  *      itxs to commit is traversed from head to tail, and each itx is
  *      committed to an lwb in that order.
  *
  *      * To clarify:
  *
  *        - the order of "sync" itxs is preserved w.r.t. other
  *          "sync" itxs, regardless of the corresponding objects.
  *        - the order of "async" itxs is preserved w.r.t. other
  *          "async" itxs corresponding to the same object.
  *        - the order of "async" itxs is *not* preserved w.r.t. other
  *          "async" itxs corresponding to different objects.
  *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
  *          versa) is *not* preserved, even for itxs that correspond
  *          to the same object.
  *
  *      For more details, see: zil_itx_assign(), zil_async_to_sync(),
  *      zil_get_commit_list(), and zil_process_commit_list().
  *
  *   3. The lwbs represent a linked list of blocks on disk. Thus, any
  *      lwb cannot be considered committed to stable storage, until its
  *      "previous" lwb is also committed to stable storage. This fact,
  *      coupled with the fact described above, means that itxs are
  *      committed in (roughly) the order in which they were generated.
  *      This is essential because itxs are dependent on prior itxs.
  *      Thus, we *must not* deem an itx as being committed to stable
  *      storage, until *all* prior itxs have also been committed to
  *      stable storage.
  *
  *      To enforce this ordering of lwb zio's, while still leveraging as
  *      much of the underlying storage performance as possible, we rely
  *      on two fundamental concepts:
  *
  *          1. The creation and issuance of lwb zio's is protected by
  *             the zilog's "zl_issuer_lock", which ensures only a single
  *             thread is creating and/or issuing lwb's at a time
  *          2. The "previous" lwb is a child of the "current" lwb
  *             (leveraging the zio parent-child dependency graph)
  *
  *      By relying on this parent-child zio relationship, we can have
  *      many lwb zio's concurrently issued to the underlying storage,
  *      but the order in which they complete will be the same order in
  *      which they were created.
  */
 void
 zil_commit(zilog_t *zilog, uint64_t foid)
 {
 	/*
 	 * zil_commit() must never be called on a snapshot:
 	 *
 	 * 1. A snapshot cannot be modified, so it cannot have in-flight
 	 *    itxs that would have modified the dataset.
 	 *
 	 * 2. Calling zil_commit() assigns a commit itx to this zilog,
 	 *    which dirties the zilog. The zilog of a snapshot must not
 	 *    be dirtied; other code enforces that invariant and will
 	 *    panic if it is violated.
 	 */
 	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
 
 	/* Nothing to do when sync is disabled for this zilog. */
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return;
 
 	if (!spa_writeable(zilog->zl_spa)) {
 		/*
 		 * A non-writeable SPA should never have itxs pending.
 		 * Returning here with pending itxs would skip writing
 		 * them out and break the semantics of zil_commit(), so
 		 * verify that there are none before returning.
 		 */
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 		for (int t = 0; t < TXG_SIZE; t++) {
 			ASSERT3P(zilog->zl_itxg[t].itxg_itxs, ==, NULL);
 		}
 		return;
 	}
 
 	if (zilog->zl_suspend > 0) {
 		/*
 		 * A suspended ZIL must not be dirtied by assigning a
 		 * commit itx, and no lwbs can be written out for it.
 		 * txg_wait_synced() is used instead to provide the
 		 * required semantics.
 		 */
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	} else {
 		zil_commit_impl(zilog, foid);
 	}
 }
 
 void
 zil_commit_impl(zilog_t *zilog, uint64_t foid)
 {
 	zil_commit_waiter_t *zcw;
 	uint64_t wait_txg;
 
 	ZIL_STAT_BUMP(zilog, zil_commit_count);
 
 	/*
 	 * Promote the "async" itxs for the given foid onto the "sync"
 	 * queues, so that zil_process_commit_list() will later commit
 	 * (or skip) them.
 	 *
 	 * These itxs have to be committed before this call returns, so
 	 * this must happen before zil_commit_itx_assign() is called.
 	 */
 	zil_async_to_sync(zilog, foid);
 
 	/*
 	 * Allocate the "waiter" that is initially linked to the commit
 	 * itx through its "itx_private" field. The commit itx has no
 	 * on-disk state; when it is committed to an lwb, instead of
 	 * copying its lr_t into the lwb's buffer, its waiter is added
 	 * to the lwb's list of waiters. Once the lwb reaches stable
 	 * storage, every waiter on that list is marked "done" and
 	 * signalled.
 	 *
 	 * The waiter must exist, and the commit itx must be assigned,
 	 * before zil_commit_writer() is called; otherwise our commit itx
 	 * is not guaranteed to have been committed to an lwb by the time
 	 * zil_commit_waiter() is called.
 	 */
 	zcw = zil_alloc_commit_waiter();
 	zil_commit_itx_assign(zilog, zcw);
 
 	wait_txg = zil_commit_writer(zilog, zcw);
 	zil_commit_waiter(zilog, zcw);
 
 	if (zcw->zcw_zio_error == 0) {
 		/* No write error; wait only if the writer named a txg. */
 		if (wait_txg != 0)
 			txg_wait_synced(zilog->zl_dmu_pool, wait_txg);
 	} else {
 		/*
 		 * Writing the ZIL blocks this thread waits on failed, so
 		 * fall back to spa_sync() to get the data written out.
 		 * This has performance implications, but it is expected
 		 * to be an exceptional case that doesn't occur often.
 		 */
 		DTRACE_PROBE2(zil__commit__io__error,
 		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
 		txg_wait_synced(zilog->zl_dmu_pool, 0);
 	}
 
 	zil_free_commit_waiter(zcw);
 }
 
 /*
  * Called in syncing context to free committed log blocks and update log header.
  *
  * zilog: the intent log being synced.
  * tx:    the syncing transaction; its txg selects the zl_replayed_seq slot
  *        and bounds which lwbs may be freed.
  *
  * Only the first sync pass does any work; later passes return immediately.
  */
 void
 zil_sync(zilog_t *zilog, dmu_tx_t *tx)
 {
 	zil_header_t *zh = zil_header_in_syncing_context(zilog);
 	uint64_t txg = dmu_tx_get_txg(tx);
 	spa_t *spa = zilog->zl_spa;
 	/* Replay sequence number recorded for this txg's slot, if any. */
 	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
 	lwb_t *lwb;
 
 	/*
 	 * We don't zero out zl_destroy_txg, so make sure we don't try
 	 * to destroy it twice.
 	 */
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	zil_lwb_flush_wait_all(zilog, txg);
 
 	mutex_enter(&zilog->zl_lock);
 
 	ASSERT(zilog->zl_stop_sync == 0);
 
 	/* Move a pending replay sequence number into the header. */
 	if (*replayed_seq != 0) {
 		ASSERT(zh->zh_replay_seq < *replayed_seq);
 		zh->zh_replay_seq = *replayed_seq;
 		*replayed_seq = 0;
 	}
 
 	/* The log was destroyed in this txg: reset the header. */
 	if (zilog->zl_destroy_txg == txg) {
 		/* Saved before the header is zeroed below. */
 		blkptr_t blk = zh->zh_log;
 		dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
 
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 		memset(zh, 0, sizeof (zil_header_t));
 		memset(zilog->zl_replayed_seq, 0,
 		    sizeof (zilog->zl_replayed_seq));
 
 		if (zilog->zl_keep_first) {
 			/*
 			 * If this block was part of log chain that couldn't
 			 * be claimed because a device was missing during
 			 * zil_claim(), but that device later returns,
 			 * then this block could erroneously appear valid.
 			 * To guard against this, assign a new GUID to the new
 			 * log chain so it doesn't matter what blk points to.
 			 */
 			zil_init_log_chain(zilog, &blk);
 			zh->zh_log = blk;
 		} else {
 			/*
 			 * A destroyed ZIL chain can't contain any TX_SETSAXATTR
 			 * records. So, deactivate the feature for this dataset.
 			 * We activate it again when we start a new ZIL chain.
 			 */
 			if (dsl_dataset_feature_is_active(ds,
 			    SPA_FEATURE_ZILSAXATTR))
 				dsl_dataset_deactivate_feature(ds,
 				    SPA_FEATURE_ZILSAXATTR, tx);
 		}
 	}
 
 	/*
 	 * Walk the lwb list from its head. The header is pointed at the
 	 * current head's block before that lwb is examined, so when the
 	 * loop stops at an lwb that can't be freed yet (not flush-done, or
 	 * with an alloc/max txg beyond this txg), the header references
 	 * that lwb's block. Every lwb before it is removed and freed.
 	 */
 	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
 		zh->zh_log = lwb->lwb_blk;
 		if (lwb->lwb_state != LWB_STATE_FLUSH_DONE ||
 		    lwb->lwb_alloc_txg > txg || lwb->lwb_max_txg > txg)
 			break;
 		list_remove(&zilog->zl_lwb_list, lwb);
 		if (!BP_IS_HOLE(&lwb->lwb_blk))
 			zio_free(spa, txg, &lwb->lwb_blk);
 		zil_free_lwb(zilog, lwb);
 
 		/*
 		 * If we don't have anything left in the lwb list then
 		 * we've had an allocation failure and we need to zero
 		 * out the zil_header blkptr so that we don't end
 		 * up freeing the same block twice.
 		 */
 		if (list_is_empty(&zilog->zl_lwb_list))
 			BP_ZERO(&zh->zh_log);
 	}
 
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Constructor for zil_lwb_cache objects: set up the itx list, the
  * commit waiter list, the vdev tree and the vdev lock embedded in an
  * lwb. Always returns 0.
  */
 static int
 zil_lwb_cons(void *vbuf, void *unused, int kmflag)
 {
 	lwb_t *lwb = vbuf;
 
 	(void) unused;
 	(void) kmflag;
 
 	list_create(&lwb->lwb_itxs,
 	    sizeof (itx_t), offsetof(itx_t, itx_node));
 	list_create(&lwb->lwb_waiters,
 	    sizeof (zil_commit_waiter_t),
 	    offsetof(zil_commit_waiter_t, zcw_node));
 	avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare,
 	    sizeof (zil_vdev_node_t),
 	    offsetof(zil_vdev_node_t, zv_node));
 	mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (0);
 }
 
 /*
  * Destructor for zil_lwb_cache objects: undo zil_lwb_cons(), tearing
  * the embedded members down in the reverse of their creation order.
  */
 static void
 zil_lwb_dest(void *vbuf, void *unused)
 {
 	lwb_t *lwb = vbuf;
 
 	(void) unused;
 
 	mutex_destroy(&lwb->lwb_vdev_lock);
 	avl_destroy(&lwb->lwb_vdev_tree);
 	list_destroy(&lwb->lwb_waiters);
 	list_destroy(&lwb->lwb_itxs);
 }
 
 /*
  * Module-wide ZIL setup: create the lwb and commit waiter caches,
  * initialize the global ZIL sums, and create and install the global
  * "zil" kstat when it can be created.
  */
 void
 zil_init(void)
 {
 	/* lwbs get their embedded members set up by cons/dest callbacks. */
 	zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (lwb_t), 0,
 	    zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0);
 
 	/* Commit waiters have no constructor or destructor. */
 	zil_zcw_cache = kmem_cache_create("zil_zcw_cache",
 	    sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
 	zil_sums_init(&zil_sums_global);
 
 	zil_kstats_global = kstat_create("zfs", 0, "zil", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 
 	/* Without a kstat there is nothing further to wire up. */
 	if (zil_kstats_global == NULL)
 		return;
 
 	zil_kstats_global->ks_private = NULL;
 	zil_kstats_global->ks_update = zil_kstats_global_update;
 	zil_kstats_global->ks_data = &zil_stats;
 	kstat_install(zil_kstats_global);
 }
 
 /*
  * Module-wide ZIL teardown; releases what zil_init() created: both kmem
  * caches, the global kstat (if it was created), and the global ZIL sums.
  */
 void
 zil_fini(void)
 {
 	kmem_cache_destroy(zil_zcw_cache);
 	kmem_cache_destroy(zil_lwb_cache);
 
 	/* The kstat only exists if kstat_create() succeeded in zil_init(). */
 	if (zil_kstats_global != NULL) {
 		kstat_delete(zil_kstats_global);
 		zil_kstats_global = NULL;
 	}
 
 	/*
 	 * NOTE(review): presumably the sums are torn down after the kstat
 	 * because the kstat's update callback reads them -- confirm against
 	 * zil_kstats_global_update().
 	 */
 	zil_sums_fini(&zil_sums_global);
 }
 
 /*
  * Update the sync setting stored in the zilog (zl_sync). zil_commit()
  * returns without doing anything when this is ZFS_SYNC_DISABLED.
  * The store is done without taking any lock.
  */
 void
 zil_set_sync(zilog_t *zilog, uint64_t sync)
 {
 	zilog->zl_sync = sync;
 }
 
 /*
  * Update the logbias setting stored in the zilog (zl_logbias), which
  * zil_alloc() initially takes from dmu_objset_logbias().
  * The store is done without taking any lock.
  */
 void
 zil_set_logbias(zilog_t *zilog, uint64_t logbias)
 {
 	zilog->zl_logbias = logbias;
 }
 
 /*
  * Allocate and initialize the in-memory zilog for an objset.
  *
  * os:      the objset this log belongs to.
  * zh_phys: the log header to reference from the zilog.
  *
  * Returns the new, zero-filled and initialized zilog; the allocation
  * uses KM_SLEEP.
  */
 zilog_t *
 zil_alloc(objset_t *os, zil_header_t *zh_phys)
 {
 	zilog_t *zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP);
 	int idx;
 
 	/* What this log belongs to, and the properties cached from it. */
 	zilog->zl_header = zh_phys;
 	zilog->zl_os = os;
 	zilog->zl_spa = dmu_objset_spa(os);
 	zilog->zl_dmu_pool = dmu_objset_pool(os);
 	zilog->zl_destroy_txg = TXG_INITIAL - 1;
 	zilog->zl_logbias = dmu_objset_logbias(os);
 	zilog->zl_sync = dmu_objset_syncprop(os);
 	zilog->zl_dirty_max_txg = 0;
 	zilog->zl_last_lwb_opened = NULL;
 	zilog->zl_last_lwb_latency = 0;
 
 	/*
 	 * The maximum block size is the zil_maxblocksize tunable aligned
 	 * down to ZIL_MIN_BLKSZ, but never less than ZIL_MIN_BLKSZ and
 	 * never more than the pool's maximum block size.
 	 */
 	zilog->zl_max_block_size = MIN(MAX(P2ALIGN_TYPED(zil_maxblocksize,
 	    ZIL_MIN_BLKSZ, uint64_t), ZIL_MIN_BLKSZ),
 	    spa_maxblocksize(dmu_objset_spa(os)));
 
 	/* Locks. */
 	mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zilog->zl_lwb_io_lock, NULL, MUTEX_DEFAULT, NULL);
 	for (idx = 0; idx < TXG_SIZE; idx++) {
 		mutex_init(&zilog->zl_itxg[idx].itxg_lock, NULL,
 		    MUTEX_DEFAULT, NULL);
 	}
 
 	/* Lists of lwbs and of itxs being committed. */
 	list_create(&zilog->zl_lwb_list, sizeof (lwb_t),
 	    offsetof(lwb_t, lwb_node));
 	list_create(&zilog->zl_itx_commit_list, sizeof (itx_t),
 	    offsetof(itx_t, itx_node));
 
 	/* Condition variables. */
 	cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL);
 	cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL);
 
 	/* Seed each burst slot with the largest possible payload size. */
 	for (idx = 0; idx < ZIL_BURSTS; idx++) {
 		zilog->zl_prev_opt[idx] = zilog->zl_max_block_size -
 		    sizeof (zil_chain_t);
 	}
 
 	return (zilog);
 }
 
 /*
  * Destroy a zilog created by zil_alloc(). The log must not be suspended
  * or suspending, and its lwb and commit lists must already be empty.
  */
 void
 zil_free(zilog_t *zilog)
 {
 	zilog->zl_stop_sync = 1;
 
 	ASSERT0(zilog->zl_suspend);
 	ASSERT0(zilog->zl_suspending);
 
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 	list_destroy(&zilog->zl_lwb_list);
 
 	ASSERT(list_is_empty(&zilog->zl_itx_commit_list));
 	list_destroy(&zilog->zl_itx_commit_list);
 
 	for (int t = 0; t < TXG_SIZE; t++) {
 		/*
 		 * An itx can be generated without dirtying a txg (e.g.
 		 * ztest TX_TRUNCATE), in which case no zil_clean()
 		 * callback ever removes the entry; clean those up here.
 		 *
 		 * This also frees the ziltest itxs.
 		 */
 		if (zilog->zl_itxg[t].itxg_itxs != NULL)
 			zil_itxg_clean(zilog->zl_itxg[t].itxg_itxs);
 		mutex_destroy(&zilog->zl_itxg[t].itxg_lock);
 	}
 
 	cv_destroy(&zilog->zl_cv_suspend);
 	cv_destroy(&zilog->zl_lwb_io_cv);
 
 	mutex_destroy(&zilog->zl_issuer_lock);
 	mutex_destroy(&zilog->zl_lock);
 	mutex_destroy(&zilog->zl_lwb_io_lock);
 
 	kmem_free(zilog, sizeof (zilog_t));
 }
 
 /*
  * Open an intent log.
  *
  * Looks up the objset's zilog, records the caller's get_data callback
  * and stats structure in it, and returns it. The zilog is expected to
  * have no get_data callback, no opened lwb and an empty lwb list.
  */
 zilog_t *
 zil_open(objset_t *os, zil_get_data_t *get_data, zil_sums_t *zil_sums)
 {
 	zilog_t *zilog;
 
 	zilog = dmu_objset_zil(os);
 
 	ASSERT3P(zilog->zl_get_data, ==, NULL);
 	ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
 	ASSERT(list_is_empty(&zilog->zl_lwb_list));
 
 	zilog->zl_sums = zil_sums;
 	zilog->zl_get_data = get_data;
 
 	return (zilog);
 }
 
 /*
  * Close an intent log.
  *
  * For a non-snapshot, commit everything outstanding first. Then wait for
  * the highest txg the log may still depend on to sync, clear the get_data
  * callback, and free the one lwb that may remain on the list.
  */
 void
 zil_close(zilog_t *zilog)
 {
 	lwb_t *lwb;
 	uint64_t txg;
 
 	if (!dmu_objset_is_snapshot(zilog->zl_os)) {
 		zil_commit(zilog, 0);
 	} else {
 		/* A snapshot's log is expected to be empty and clean. */
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT0(zilog->zl_dirty_max_txg);
 		ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE);
 	}
 
 	/*
 	 * Start from the highest dirty txg, and raise it to cover the
 	 * alloc and max txgs of the last lwb on the list, if there is one.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	txg = zilog->zl_dirty_max_txg;
 	lwb = list_tail(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		txg = MAX(txg, lwb->lwb_alloc_txg);
 		txg = MAX(txg, lwb->lwb_max_txg);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends
 	 * on the time when the dmu_tx transaction is assigned in
 	 * zil_lwb_write_issue().
 	 */
 	mutex_enter(&zilog->zl_lwb_io_lock);
 	txg = MAX(zilog->zl_lwb_max_issued_txg, txg);
 	mutex_exit(&zilog->zl_lwb_io_lock);
 
 	/*
 	 * We need to use txg_wait_synced() to wait until that txg is synced.
 	 * zil_sync() will guarantee all lwbs up to that txg have been
 	 * written out, flushed, and cleaned.
 	 */
 	if (txg != 0)
 		txg_wait_synced(zilog->zl_dmu_pool, txg);
 
 	/*
 	 * A still-dirty zilog is always logged, but it is only fatal when
 	 * the txg waited for is below the pool's freeze txg.
 	 */
 	if (zilog_is_dirty(zilog))
 		zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog,
 		    (u_longlong_t)txg);
 	if (txg < spa_freeze_txg(zilog->zl_spa))
 		VERIFY(!zilog_is_dirty(zilog));
 
 	zilog->zl_get_data = NULL;
 
 	/*
 	 * We should have only one lwb left on the list; remove it now.
 	 * It is expected to still be in the NEW state, so its buffer is
 	 * freed here along with the lwb itself.
 	 */
 	mutex_enter(&zilog->zl_lock);
 	lwb = list_remove_head(&zilog->zl_lwb_list);
 	if (lwb != NULL) {
 		ASSERT(list_is_empty(&zilog->zl_lwb_list));
 		ASSERT3S(lwb->lwb_state, ==, LWB_STATE_NEW);
 		zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
 		zil_free_lwb(zilog, lwb);
 	}
 	mutex_exit(&zilog->zl_lock);
 }
 
 /*
  * Tag passed to the objset/dataset hold and release calls made by
  * zil_suspend() and zil_resume().
  */
 static const char *suspend_tag = "zil suspending";
 
 /*
  * Suspend an intent log.  While in suspended mode, we still honor
  * synchronous semantics, but we rely on txg_wait_synced() to do it.
  * On old version pools, we suspend the log briefly when taking a
  * snapshot so that it will have an empty intent log.
  *
  * Long holds are not really intended to be used the way we do here --
  * held for such a short time.  A concurrent caller of dsl_dataset_long_held()
  * could fail.  Therefore we take pains to only put a long hold if it is
  * actually necessary.  Fortunately, it will only be necessary if the
  * objset is currently mounted (or the ZVOL equivalent).  In that case it
  * will already have a long hold, so we are not really making things any worse.
  *
  * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or
  * zvol_state_t), and use their mechanism to prevent their hold from being
  * dropped (e.g. VFS_HOLD()).  However, that would be even more pain for
  * very little gain.
  *
  * if cookiep == NULL, this does both the suspend & resume.
  * Otherwise, it returns with the dataset "long held", and the cookie
  * should be passed into zil_resume().
  */
 int
 zil_suspend(const char *osname, void **cookiep)
 {
 	objset_t *os;
 	zilog_t *zilog;
 	const zil_header_t *zh;
 	int error;
 
 	/* Returns the hold error unchanged if the objset can't be held. */
 	error = dmu_objset_hold(osname, suspend_tag, &os);
 	if (error != 0)
 		return (error);
 	zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	zh = zilog->zl_header;
 
 	/* A log that still needs replay can't be suspended: EBUSY. */
 	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (SET_ERROR(EBUSY));
 	}
 
 	/*
 	 * Don't put a long hold in the cases where we can avoid it.  This
 	 * is when there is no cookie so we are doing a suspend & resume
 	 * (i.e. called from zil_vdev_offline()), and there's nothing to do
 	 * for the suspend because it's already suspended, or there's no ZIL.
 	 */
 	if (cookiep == NULL && !zilog->zl_suspending &&
 	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
 		mutex_exit(&zilog->zl_lock);
 		dmu_objset_rele(os, suspend_tag);
 		return (0);
 	}
 
 	/*
 	 * Take the long hold on the dataset and release the pool with the
 	 * same tag. From here on, the paths that give the dataset up use
 	 * dsl_dataset_long_rele() + dsl_dataset_rele() (directly or via
 	 * zil_resume()) rather than dmu_objset_rele().
 	 * NOTE(review): presumably dsl_pool_rele() drops the pool hold that
 	 * dmu_objset_hold() took -- confirm.
 	 */
 	dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag);
 	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
 
 	zilog->zl_suspend++;
 
 	if (zilog->zl_suspend > 1) {
 		/*
 		 * Someone else is already suspending it.
 		 * Just wait for them to finish.
 		 */
 
 		while (zilog->zl_suspending)
 			cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock);
 		mutex_exit(&zilog->zl_lock);
 
 		if (cookiep == NULL)
 			zil_resume(os);
 		else
 			*cookiep = os;
 		return (0);
 	}
 
 	/*
 	 * If there is no pointer to an on-disk block, this ZIL must not
 	 * be active (e.g. filesystem not mounted), so there's nothing
 	 * to clean up.
 	 */
 	if (BP_IS_HOLE(&zh->zh_log)) {
 		ASSERT(cookiep != NULL); /* fast path already handled */
 
 		*cookiep = os;
 		mutex_exit(&zilog->zl_lock);
 		return (0);
 	}
 
 	/*
 	 * The ZIL has work to do. Ensure that the associated encryption
 	 * key will remain mapped while we are committing the log by
 	 * grabbing a reference to it. If the key isn't loaded we have no
 	 * choice but to return an error until the wrapping key is loaded.
 	 *
 	 * On failure, undo the zl_suspend increment and drop the long hold
 	 * and the dataset hold before returning EACCES.
 	 */
 	if (os->os_encrypted &&
 	    dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) {
 		zilog->zl_suspend--;
 		mutex_exit(&zilog->zl_lock);
 		dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 		dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Mark the suspend as in progress; concurrent callers that find
 	 * zl_suspend > 1 wait on zl_cv_suspend until this is cleared below.
 	 */
 	zilog->zl_suspending = B_TRUE;
 	mutex_exit(&zilog->zl_lock);
 
 	/*
 	 * We need to use zil_commit_impl to ensure we wait for all
 	 * LWB_STATE_OPENED, _CLOSED and _READY lwbs to be committed
 	 * to disk before proceeding. If we used zil_commit instead, it
 	 * would just call txg_wait_synced(), because zl_suspend is set.
 	 * txg_wait_synced() doesn't wait for these lwb's to be
 	 * LWB_STATE_FLUSH_DONE before returning.
 	 */
 	zil_commit_impl(zilog, 0);
 
 	/*
 	 * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we
 	 * use txg_wait_synced() to ensure the data from the zilog has
 	 * migrated to the main pool before calling zil_destroy().
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zil_destroy(zilog, B_FALSE);
 
 	/* Suspend complete: wake any callers waiting in the loop above. */
 	mutex_enter(&zilog->zl_lock);
 	zilog->zl_suspending = B_FALSE;
 	cv_broadcast(&zilog->zl_cv_suspend);
 	mutex_exit(&zilog->zl_lock);
 
 	/* Drop the key mapping created before the commit. */
 	if (os->os_encrypted)
 		dsl_dataset_remove_key_mapping(dmu_objset_ds(os));
 
 	/* Without a cookie, resume right away; else hand the objset back. */
 	if (cookiep == NULL)
 		zil_resume(os);
 	else
 		*cookiep = os;
 	return (0);
 }
 
 void
 zil_resume(void *cookie)
 {
 	objset_t *os = cookie;
 	zilog_t *zilog = dmu_objset_zil(os);
 
 	mutex_enter(&zilog->zl_lock);
 	ASSERT(zilog->zl_suspend != 0);
 	zilog->zl_suspend--;
 	mutex_exit(&zilog->zl_lock);
 	dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag);
 	dsl_dataset_rele(dmu_objset_ds(os), suspend_tag);
 }
 
 /*
  * State handed to zil_replay_log_record() through its opaque argument.
  */
 typedef struct zil_replay_arg {
 	/* Replay functions, indexed by transaction type. */
 	zil_replay_func_t *const *zr_replay;
 	/* Opaque first argument passed to each replay function. */
 	void		*zr_arg;
 	/* Whether the log's records were byteswapped. */
 	boolean_t	zr_byteswap;
 	/* Scratch buffer each record is copied into before replay. */
 	char		*zr_lr;
 } zil_replay_arg_t;
 
 /*
  * Report a failed replay of a log record. The replaying sequence
  * number is backed off by one since the record was not replayed, a
  * warning naming the dataset and the record is logged, and the error
  * that was passed in is returned to the caller unchanged.
  */
 static int
 zil_replay_error(zilog_t *zilog, const lr_t *lr, int error)
 {
 	char dsname[ZFS_MAX_DATASET_NAME_LEN];
 	u_longlong_t seq, txtype;
 	const char *ci_tag;
 
 	/* This record didn't actually get replayed. */
 	zilog->zl_replaying_seq--;
 
 	dmu_objset_name(zilog->zl_os, dsname);
 
 	/* Report the type without the case-insensitive bit; tag it instead. */
 	seq = (u_longlong_t)lr->lrc_seq;
 	txtype = (u_longlong_t)(lr->lrc_txtype & ~TX_CI);
 	ci_tag = (lr->lrc_txtype & TX_CI) ? "CI" : "";
 
 	cmn_err(CE_WARN, "ZFS replay transaction error %d, "
 	    "dataset %s, seq 0x%llx, txtype %llu %s\n", error, dsname,
 	    seq, txtype, ci_tag);
 
 	return (error);
 }
 
 /*
  * Replay a single log record.
  *
  * zilog:     the log being replayed.
  * lr:        the record to replay.
  * zra:       a zil_replay_arg_t with the replay vector and scratch buffer.
  * claim_txg: records from txgs before this one are skipped.
  *
  * Returns 0 if the record was replayed or legitimately skipped; otherwise
  * the error, after it has been reported through zil_replay_error().
  */
 static int
 zil_replay_log_record(zilog_t *zilog, const lr_t *lr, void *zra,
     uint64_t claim_txg)
 {
 	zil_replay_arg_t *zr = zra;
 	const zil_header_t *zh = zilog->zl_header;
 	uint64_t reclen = lr->lrc_reclen;
 	uint64_t txtype = lr->lrc_txtype;
 	int error = 0;
 
 	/* Recorded even for records that end up being skipped below. */
 	zilog->zl_replaying_seq = lr->lrc_seq;
 
 	if (lr->lrc_seq <= zh->zh_replay_seq)	/* already replayed */
 		return (0);
 
 	if (lr->lrc_txg < claim_txg)		/* already committed */
 		return (0);
 
 	/* Strip case-insensitive bit, still present in log record */
 	txtype &= ~TX_CI;
 
 	/* The type indexes the replay vector, so it must be in range. */
 	if (txtype == 0 || txtype >= TX_MAX_TYPE)
 		return (zil_replay_error(zilog, lr, EINVAL));
 
 	/*
 	 * If this record type can be logged out of order, the object
 	 * (lr_foid) may no longer exist.  That's legitimate, not an error.
 	 */
 	if (TX_OOO(txtype)) {
 		error = dmu_object_info(zilog->zl_os,
 		    LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL);
 		if (error == ENOENT || error == EEXIST)
 			return (0);
 	}
 
 	/*
 	 * Make a copy of the data so we can revise and extend it.
 	 */
 	memcpy(zr->zr_lr, lr, reclen);
 
 	/*
 	 * If this is a TX_WRITE with a blkptr, suck in the data.
 	 * The data is placed in the scratch buffer right after the record.
 	 */
 	if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) {
 		error = zil_read_log_data(zilog, (lr_write_t *)lr,
 		    zr->zr_lr + reclen);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 
 	/*
 	 * The log block containing this lr may have been byteswapped
 	 * so that we can easily examine common fields like lrc_txtype.
 	 * However, the log is a mix of different record types, and only the
 	 * replay vectors know how to byteswap their records.  Therefore, if
 	 * the lr was byteswapped, undo it before invoking the replay vector.
 	 */
 	if (zr->zr_byteswap)
 		byteswap_uint64_array(zr->zr_lr, reclen);
 
 	/*
 	 * We must now do two things atomically: replay this log record,
 	 * and update the log header sequence number to reflect the fact that
 	 * we did so. At the end of each replay function the sequence number
 	 * is updated if we are in replay mode.
 	 */
 	error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap);
 	if (error != 0) {
 		/*
 		 * The DMU's dnode layer doesn't see removes until the txg
 		 * commits, so a subsequent claim can spuriously fail with
 		 * EEXIST. So if we receive any error we try syncing out
 		 * any removes then retry the transaction.  Note that we
 		 * specify B_FALSE for byteswap now, so we don't do it twice.
 		 */
 		txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0);
 		error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE);
 		if (error != 0)
 			return (zil_replay_error(zilog, lr, error));
 	}
 	return (0);
 }
 
 /*
  * Block callback handed to zil_parse() by zil_replay(): bump the count of
  * log blocks seen (zl_replay_blks).  The block pointer, callback argument
  * and claim txg are not used.  Always returns 0.
  */
 static int
 zil_incr_blks(zilog_t *zilog, const blkptr_t *bp, void *arg, uint64_t claim_txg)
 {
 	(void) bp;
 	(void) arg;
 	(void) claim_txg;
 
 	zilog->zl_replay_blks += 1;
 	return (0);
 }
 
 /*
  * Replay this dataset's intent log, then destroy it.
  *
  * When the log header does not have ZIL_REPLAY_NEEDED set nothing is
  * replayed; the log is destroyed and the result of zil_destroy() is
  * returned.  Otherwise the log is parsed with zil_replay_log_record() as
  * the record callback, using replay_func[] as the per-txtype replay vector
  * and passing arg through to it, and B_TRUE is returned.
  */
 boolean_t
 zil_replay(objset_t *os, void *arg,
     zil_replay_func_t *const replay_func[TX_MAX_TYPE])
 {
 	zilog_t *zilog = dmu_objset_zil(os);
 	const zil_header_t *hdr = zilog->zl_header;
 	zil_replay_arg_t replay;
 
 	/* No replay required: just tear the log down. */
 	if (!(hdr->zh_flags & ZIL_REPLAY_NEEDED))
 		return (zil_destroy(zilog, B_TRUE));
 
 	/* Scratch buffer into which the record callback copies each lr. */
 	replay.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP);
 	replay.zr_byteswap = BP_SHOULD_BYTESWAP(&hdr->zh_log);
 	replay.zr_replay = replay_func;
 	replay.zr_arg = arg;
 
 	/*
 	 * Let any in-progress removes sync out before replay begins.
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, 0);
 
 	zilog->zl_replay = B_TRUE;
 	zilog->zl_replay_time = ddi_get_lbolt();
 	ASSERT(zilog->zl_replay_blks == 0);
 	(void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record,
 	    &replay, hdr->zh_claim_txg, B_TRUE);
 	vmem_free(replay.zr_lr, 2 * SPA_MAXBLOCKSIZE);
 
 	/* Destroy the log and wait for zl_destroy_txg to sync. */
 	(void) zil_destroy(zilog, B_FALSE);
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 	zilog->zl_replay = B_FALSE;
 
 	return (B_TRUE);
 }
 
 /*
  * Report whether the caller is running with sync disabled or under replay.
  *
  * Returns B_TRUE when zl_sync is ZFS_SYNC_DISABLED, or when a replay is in
  * progress; in the replay case the dataset is also dirtied in tx and the
  * sequence number being replayed is stored in the zl_replayed_seq slot
  * for tx's txg.  Returns B_FALSE otherwise.
  */
 boolean_t
 zil_replaying(zilog_t *zilog, dmu_tx_t *tx)
 {
 	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
 		return (B_TRUE);
 
 	if (!zilog->zl_replay)
 		return (B_FALSE);
 
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 
 	const uint64_t slot = dmu_tx_get_txg(tx) & TXG_MASK;
 	zilog->zl_replayed_seq[slot] = zilog->zl_replaying_seq;
 
 	return (B_TRUE);
 }
 
 int
 zil_reset(const char *osname, void *arg)
 {
 	(void) arg;
 
 	int error = zil_suspend(osname, NULL);
 	/* EACCES means crypto key not loaded */
 	if ((error == EACCES) || (error == EBUSY))
 		return (SET_ERROR(error));
 	if (error != 0)
 		return (SET_ERROR(EEXIST));
 	return (0);
 }
 
 /* ZIL interfaces published through EXPORT_SYMBOL(). */
 EXPORT_SYMBOL(zil_alloc);
 EXPORT_SYMBOL(zil_free);
 EXPORT_SYMBOL(zil_open);
 EXPORT_SYMBOL(zil_close);
 EXPORT_SYMBOL(zil_replay);
 EXPORT_SYMBOL(zil_replaying);
 EXPORT_SYMBOL(zil_destroy);
 EXPORT_SYMBOL(zil_destroy_sync);
 EXPORT_SYMBOL(zil_itx_create);
 EXPORT_SYMBOL(zil_itx_destroy);
 EXPORT_SYMBOL(zil_itx_assign);
 EXPORT_SYMBOL(zil_commit);
 EXPORT_SYMBOL(zil_claim);
 EXPORT_SYMBOL(zil_check_log_chain);
 EXPORT_SYMBOL(zil_sync);
 EXPORT_SYMBOL(zil_clean);
 EXPORT_SYMBOL(zil_suspend);
 EXPORT_SYMBOL(zil_resume);
 EXPORT_SYMBOL(zil_lwb_add_block);
 EXPORT_SYMBOL(zil_bp_tree_add);
 EXPORT_SYMBOL(zil_set_sync);
 EXPORT_SYMBOL(zil_set_logbias);
 EXPORT_SYMBOL(zil_sums_init);
 EXPORT_SYMBOL(zil_sums_fini);
 EXPORT_SYMBOL(zil_kstat_values_update);
 
 /*
  * Module parameters declared with ZFS_MODULE_PARAM(); each entry carries
  * its own description string.
  */
 ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, UINT, ZMOD_RW,
 	"ZIL block open timeout percentage");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW,
 	"Disable intent logging replay");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW,
 	"Disable ZIL cache flushes");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, U64, ZMOD_RW,
 	"Limit in bytes slog sync writes per commit");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, UINT, ZMOD_RW,
 	"Limit in bytes of ZIL log block size");
 
 ZFS_MODULE_PARAM(zfs_zil, zil_, maxcopied, UINT, ZMOD_RW,
 	"Limit in bytes WR_COPIED size");
diff --git a/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in b/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in
index 4cc075585d4b..30524474d1ac 100644
--- a/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in
+++ b/sys/contrib/openzfs/rpm/generic/zfs-kmod.spec.in
@@ -1,195 +1,213 @@
 %define module  @PACKAGE@
 
 %if !%{defined ksrc}
 %if 0%{?rhel}%{?fedora}%{?openEuler}
 %define ksrc    ${kernel_version##*___}
 %else
 %define ksrc    "$( \
         if [ -e "/usr/src/linux-${kernel_version%%___*}" ]; then \
             echo "/usr/src/linux-${kernel_version%%___*}"; \
         elif [ -e "/lib/modules/${kernel_version%%___*}/source" ]; then \
             echo "/lib/modules/${kernel_version%%___*}/source"; \
         else \
             echo "/lib/modules/${kernel_version%%___*}/build"; \
         fi)"
 %endif
 %endif
 
 %if !%{defined kobj}
 %if 0%{?rhel}%{?fedora}%{?openEuler}
 %define kobj    ${kernel_version##*___}
 %else
 %define kobj    "$( \
         if [ -e "/usr/src/linux-${kernel_version%%___*}" ]; then \
             echo "/usr/src/linux-${kernel_version%%___*}"; \
         else \
             echo "/lib/modules/${kernel_version%%___*}/build"; \
         fi)"
 %endif
 %endif
 
 #define repo    rpmfusion
 #define repo    chaos
 
 # (un)define the next line to either build for the newest or all current kernels
 %define buildforkernels newest
 #define buildforkernels current
 #define buildforkernels akmod
 
 %bcond_with     debug
 %bcond_with     debuginfo
 
 
 Name:           %{module}-kmod
 
 Version:        @VERSION@
 Release:        @RELEASE@%{?dist}
 Summary:        Kernel module(s)
 
 Group:          System Environment/Kernel
 License:        @ZFS_META_LICENSE@
 URL:            https://github.com/openzfs/zfs
 Source0:        %{module}-%{version}.tar.gz
 Source10:       kmodtool
 BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id} -u -n)
 %if 0%{?rhel}%{?fedora}%{?openEuler}
 BuildRequires:  gcc, make
 BuildRequires:  elfutils-libelf-devel
 %endif
 
 %if (0%{?fedora}%{?suse_version}%{?openEuler}) || (0%{?rhel} && 0%{?rhel} < 9)
 # We don't directly use it, but if this isn't installed, rpmbuild as root can
 # crash+corrupt rpmdb
 # See issue #12071
 BuildRequires:  ncompress
 %endif
 
 # The development headers will conflict with the dkms packages.
 Conflicts:      %{module}-dkms
 
 %if %{defined repo}
 
 # Building for a repository use the proper build-sysbuild package
 # to determine which kernel-devel packages should be installed.
 BuildRequires:  %{_bindir}/kmodtool
 %{!?kernels:BuildRequires: buildsys-build-%{repo}-kerneldevpkgs-%{?buildforkernels:%{buildforkernels}}%{!?buildforkernels:current}-%{_target_cpu}}
 
 %else
 
 # Building local packages attempts to use the installed kernel.
 %{?rhel:BuildRequires: kernel-devel}
 %{?fedora:BuildRequires: kernel-devel}
 %{?openEuler:BuildRequires: kernel-devel}
 %{?suse_version:BuildRequires: kernel-source}
 
 %if !%{defined kernels} && !%{defined build_src_rpm}
     %if 0%{?rhel}%{?fedora}%{?suse_version}%{?openEuler}
         %define kernels %(ls -1 /usr/src/kernels)
     %else
         %define kernels %(ls -1 /lib/modules)
     %endif
 %endif
 %endif
 
 # LDFLAGS are not sanitized by arch/*/Makefile for these architectures.
 %ifarch ppc ppc64 ppc64le aarch64
 %global __global_ldflags %{nil}
 %endif
 
 # Kmodtool does its magic here.  A patched version of kmodtool is shipped
 # with the source rpm until kmod development packages are supported upstream.
 # https://bugzilla.rpmfusion.org/show_bug.cgi?id=2714
 %{expand:%(bash %{SOURCE10} --target %{_target_cpu} %{?repo:--repo %{?repo}} --kmodname %{name} %{?buildforkernels:--%{buildforkernels}} --devel %{?prefix:--prefix "%{?prefix}"} %{?kernels:--for-kernels "%{?kernels}"} %{?kernelbuildroot:--buildroot "%{?kernelbuildroot}"} 2>/dev/null) }
 
 
 %description
 This package contains the ZFS kernel modules.
 
 %prep
 # Error out if there was something wrong with kmodtool.
 %{?kmodtool_check}
 
 # Print kmodtool output for debugging purposes:
 bash %{SOURCE10}  --target %{_target_cpu} %{?repo:--repo %{?repo}} --kmodname %{name} %{?buildforkernels:--%{buildforkernels}} --devel %{?prefix:--prefix "%{?prefix}"} %{?kernels:--for-kernels "%{?kernels}"} %{?kernelbuildroot:--buildroot "%{?kernelbuildroot}"} 2>/dev/null
 
 %if %{with debug}
     %define debug --enable-debug
 %else
     %define debug --disable-debug
 %endif
 
 %if %{with debuginfo}
     %define debuginfo --enable-debuginfo
 %else
     %define debuginfo --disable-debuginfo
 %endif
 
 # Leverage VPATH from configure to avoid making multiple copies.
 %define _configure ../%{module}-%{version}/configure
 
 %setup -q -c -T -a 0
 
 for kernel_version in %{?kernel_versions}; do
     %{__mkdir} _kmod_build_${kernel_version%%___*}
 done
 
 %build
 for kernel_version in %{?kernel_versions}; do
     cd _kmod_build_${kernel_version%%___*}
     %configure \
         --with-config=kernel \
         --with-linux=%{ksrc} \
         --with-linux-obj=%{kobj} \
         %{debug} \
         %{debuginfo} \
         %{?kernel_cc} \
         %{?kernel_ld} \
         %{?kernel_llvm}
+
+    # Pre-6.10 kernel builds didn't need to copy over the source files to the
+    # build directory.  With 6.10 and later kernels, however, we do need to
+    # copy them because of these commits:
+    #
+    # b1992c3772e6 kbuild: use $(src) instead of $(srctree)/$(src) for source
+    #                      directory
+    #
+    # 9a0ebe5011f4 kbuild: use $(obj)/ instead of $(src)/ for common pattern
+    #                      rules
+    #
+    # Note that kmodtool actually copies over the source into the build
+    # directory, so what we're doing here is normal.  For efficiency reasons
+    # though we just use hardlinks instead of copying.
+    #
+    # See https://github.com/openzfs/zfs/issues/16439 for more info.
+    cp -lR ../%{module}-%{version}/module/* module/
+
     make %{?_smp_mflags}
     cd ..
 done
 
 
 # Module signing (modsign)
 #
 # This must be run _after_ find-debuginfo.sh runs, otherwise that will strip
 # the signature off of the modules.
 # (Based on Fedora's kernel.spec workaround)
 %define __modsign_install_post \
     sign_pem="%{ksrc}/certs/signing_key.pem"; \
     sign_x509="%{ksrc}/certs/signing_key.x509"; \
     if [ -f "${sign_x509}" ]\
     then \
         echo "Signing kernel modules ..."; \
         for kmod in $(find ${RPM_BUILD_ROOT}%{kmodinstdir_prefix}/*/extra/ -name \*.ko); do \
             %{ksrc}/scripts/sign-file sha256 ${sign_pem} ${sign_x509} ${kmod}; \
         done \
     fi \
 %{nil}
 
 # hack to ensure signing happens after find-debuginfo.sh runs
 %define __spec_install_post \
     %{?__debug_package:%{__debug_install_post}}\
     %{__arch_install_post}\
     %{__os_install_post}\
     %{__modsign_install_post}
 
 %install
 rm -rf ${RPM_BUILD_ROOT}
 
 # Relies on the kernel 'modules_install' make target.
 for kernel_version in %{?kernel_versions}; do
     cd _kmod_build_${kernel_version%%___*}
     make install \
         DESTDIR=${RPM_BUILD_ROOT} \
         %{?prefix:INSTALL_MOD_PATH=%{?prefix}} \
         INSTALL_MOD_DIR=%{kmodinstdir_postfix}
     cd ..
 done
 # find-debuginfo.sh only considers executables
 chmod u+x ${RPM_BUILD_ROOT}%{kmodinstdir_prefix}/*/extra/*/*
 %{?akmod_install}
 
 
 %clean
 rm -rf $RPM_BUILD_ROOT
diff --git a/sys/contrib/openzfs/rpm/generic/zfs.spec.in b/sys/contrib/openzfs/rpm/generic/zfs.spec.in
index 2e89abd0edfd..c7a00c61f6bb 100644
--- a/sys/contrib/openzfs/rpm/generic/zfs.spec.in
+++ b/sys/contrib/openzfs/rpm/generic/zfs.spec.in
@@ -1,589 +1,590 @@
 %global _sbindir    /sbin
 %global _libdir     /%{_lib}
 
 # Set the default udev directory based on distribution.
 %if %{undefined _udevdir}
 %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler}
 %global _udevdir    %{_prefix}/lib/udev
 %else
 %global _udevdir    /lib/udev
 %endif
 %endif
 
 # Set the default udevrule directory based on distribution.
 %if %{undefined _udevruledir}
 %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler}
 %global _udevruledir    %{_prefix}/lib/udev/rules.d
 %else
 %global _udevruledir    /lib/udev/rules.d
 %endif
 %endif
 
 # Set the default _bashcompletiondir directory based on distribution.
 %if %{undefined _bashcompletiondir}
 %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler}
 %global _bashcompletiondir    /etc/bash_completion.d
 %else
 %global _bashcompletiondir    /usr/share/bash-completion
 %endif
 %endif
 
 # Set the default dracut directory based on distribution.
 %if %{undefined _dracutdir}
 %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler}
 %global _dracutdir  %{_prefix}/lib/dracut
 %else
 %global _dracutdir  %{_prefix}/share/dracut
 %endif
 %endif
 
 %if %{undefined _initconfdir}
 %global _initconfdir /etc/sysconfig
 %endif
 
 %if %{undefined _unitdir}
 %global _unitdir %{_prefix}/lib/systemd/system
 %endif
 
 %if %{undefined _presetdir}
 %global _presetdir %{_prefix}/lib/systemd/system-preset
 %endif
 
 %if %{undefined _modulesloaddir}
 %global _modulesloaddir %{_prefix}/lib/modules-load.d
 %endif
 
 %if %{undefined _systemdgeneratordir}
 %global _systemdgeneratordir %{_prefix}/lib/systemd/system-generators
 %endif
 
 %if %{undefined _pkgconfigdir}
 %global _pkgconfigdir %{_prefix}/%{_lib}/pkgconfig
 %endif
 
 %bcond_with    debug
 %bcond_with    debuginfo
 %bcond_with    asan
 %bcond_with    ubsan
 %bcond_with    systemd
 %bcond_with    pam
 %bcond_without pyzfs
 
 # Generic enable switch for systemd
 %if %{with systemd}
 %define _systemd 1
 %endif
 
 # Distros below support systemd
 %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler}
 %define _systemd 1
 %endif
 
 # When not specified default to distribution provided version.
 %if %{undefined __use_python}
 %define __python                  /usr/bin/python3
 %define __python_pkg_version      3
 %else
 %define __python                  %{__use_python}
 %define __python_pkg_version      %{__use_python_pkg_version}
 %endif
 %define __python_sitelib          %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())" 2>/dev/null || %{__python} -Esc "import sysconfig; print(sysconfig.get_path('purelib'))")
 
 Name:           @PACKAGE@
 Version:        @VERSION@
 Release:        @RELEASE@%{?dist}
 Summary:        Commands to control the kernel modules and libraries
 
 Group:          System Environment/Kernel
 License:        @ZFS_META_LICENSE@
 URL:            https://github.com/openzfs/zfs
 Source0:        %{name}-%{version}.tar.gz
 BuildRoot:      %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
 Requires:       libzpool5%{?_isa} = %{version}-%{release}
 Requires:       libnvpair3%{?_isa} = %{version}-%{release}
 Requires:       libuutil3%{?_isa} = %{version}-%{release}
 Requires:       libzfs5%{?_isa} = %{version}-%{release}
 Requires:       %{name}-kmod = %{version}
 Provides:       %{name}-kmod-common = %{version}-%{release}
 Obsoletes:      spl <= %{version}
 
 # zfs-fuse provides the same commands and man pages that OpenZFS does.
 # Renaming those on either side would conflict with all available documentation.
 Conflicts:      zfs-fuse
 
 %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler}
 BuildRequires:  gcc, make
 BuildRequires:  zlib-devel
 BuildRequires:  libuuid-devel
 BuildRequires:  libblkid-devel
 BuildRequires:  libudev-devel
 BuildRequires:  libattr-devel
 BuildRequires:  openssl-devel
 %if 0%{?fedora}%{?suse_version}%{?openEuler} || 0%{?rhel} >= 8 || 0%{?centos} >= 8
 BuildRequires:  libtirpc-devel
 %endif
 
 %if (0%{?fedora}%{?suse_version}%{?openEuler}) || (0%{?rhel} && 0%{?rhel} < 9)
 # We don't directly use it, but if this isn't installed, rpmbuild as root can
 # crash+corrupt rpmdb
 # See issue #12071
 BuildRequires:  ncompress
 %endif
 
 Requires:       openssl
 %if 0%{?_systemd}
 BuildRequires: systemd
 %endif
 
 %endif
 
 %if 0%{?_systemd}
 Requires(post): systemd
 Requires(preun): systemd
 Requires(postun): systemd
 %endif
 
 # The zpool iostat/status -c scripts call some utilities like lsblk and iostat
 Requires:  util-linux
 Requires:  sysstat
 
 %description
 This package contains the core ZFS command line utilities.
 
 %package -n libzpool5
 Summary:        Native ZFS pool library for Linux
 Group:          System Environment/Kernel
 Obsoletes:      libzpool2 <= %{version}
 Obsoletes:      libzpool4 <= %{version}
 
 %description -n libzpool5
 This package contains the zpool library, which provides support
 for managing zpools
 
 %if %{defined ldconfig_scriptlets}
 %ldconfig_scriptlets -n libzpool5
 %else
 %post -n libzpool5 -p /sbin/ldconfig
 %postun -n libzpool5 -p /sbin/ldconfig
 %endif
 
 %package -n libnvpair3
 Summary:        Solaris name-value library for Linux
 Group:          System Environment/Kernel
 Obsoletes:      libnvpair1 <= %{version}
 
 %description -n libnvpair3
 This package contains routines for packing and unpacking name-value
 pairs.  This functionality is used to portably transport data across
 process boundaries, between kernel and user space, and can be used
 to write self describing data structures on disk.
 
 %if %{defined ldconfig_scriptlets}
 %ldconfig_scriptlets -n libnvpair3
 %else
 %post -n libnvpair3 -p /sbin/ldconfig
 %postun -n libnvpair3 -p /sbin/ldconfig
 %endif
 
 %package -n libuutil3
 Summary:        Solaris userland utility library for Linux
 Group:          System Environment/Kernel
 Obsoletes:      libuutil1 <= %{version}
 
 %description -n libuutil3
 This library provides a variety of compatibility functions for OpenZFS:
  * libspl: The Solaris Porting Layer userland library, which provides APIs
    that make it possible to run Solaris user code in a Linux environment
    with relatively minimal modification.
  * libavl: The Adelson-Velskii Landis balanced binary tree manipulation
    library.
  * libefi: The Extensible Firmware Interface library for GUID disk
    partitioning.
  * libshare: NFS, SMB, and iSCSI service integration for ZFS.
 
 %if %{defined ldconfig_scriptlets}
 %ldconfig_scriptlets -n libuutil3
 %else
 %post -n libuutil3 -p /sbin/ldconfig
 %postun -n libuutil3 -p /sbin/ldconfig
 %endif
 
 # The library version is encoded in the package name.  When updating the
 # version information it is important to add an obsoletes line below for
 # the previous version of the package.
 %package -n libzfs5
 Summary:        Native ZFS filesystem library for Linux
 Group:          System Environment/Kernel
 Obsoletes:      libzfs2 <= %{version}
 Obsoletes:      libzfs4 <= %{version}
 
 %description -n libzfs5
 This package provides support for managing ZFS filesystems
 
 %if %{defined ldconfig_scriptlets}
 %ldconfig_scriptlets -n libzfs5
 %else
 %post -n libzfs5 -p /sbin/ldconfig
 %postun -n libzfs5 -p /sbin/ldconfig
 %endif
 
 %package -n libzfs5-devel
 Summary:        Development headers
 Group:          System Environment/Kernel
 Requires:       libzfs5%{?_isa} = %{version}-%{release}
 Requires:       libzpool5%{?_isa} = %{version}-%{release}
 Requires:       libnvpair3%{?_isa} = %{version}-%{release}
 Requires:       libuutil3%{?_isa} = %{version}-%{release}
 Provides:       libzpool5-devel = %{version}-%{release}
 Provides:       libnvpair3-devel = %{version}-%{release}
 Provides:       libuutil3-devel = %{version}-%{release}
 Obsoletes:      zfs-devel <= %{version}
 Obsoletes:      libzfs2-devel <= %{version}
 Obsoletes:      libzfs4-devel <= %{version}
 
 %description -n libzfs5-devel
 This package contains the header files needed for building additional
 applications against the ZFS libraries.
 
 %package test
 Summary:        Test infrastructure
 Group:          System Environment/Kernel
 Requires:       %{name}%{?_isa} = %{version}-%{release}
 Requires:       parted
 Requires:       lsscsi
 Requires:       mdadm
 Requires:       bc
 Requires:       ksh
 Requires:       fio
 Requires:       acl
 Requires:       sudo
 Requires:       sysstat
 Requires:       libaio
 Requires:       python%{__python_pkg_version}
 %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler}
 BuildRequires:  libaio-devel
 %endif
 AutoReqProv:    no
 
 %description test
 This package contains test infrastructure and support scripts for
 validating the file system.
 
 %package dracut
 Summary:        Dracut module
 Group:          System Environment/Kernel
 BuildArch:	noarch
 Requires:       %{name} >= %{version}
 Requires:       dracut
 Requires:       /usr/bin/awk
 Requires:       grep
 
 %description dracut
 This package contains a dracut module used to construct an initramfs
 image which is ZFS aware.
 
 %if %{with pyzfs}
 # Enforce `python36-` package prefix for CentOS 7
 # since dependencies come from EPEL and are named this way
 %package -n python%{__python_pkg_version}-pyzfs
 Summary:        Python %{python_version} wrapper for libzfs_core
 Group:          Development/Languages/Python
 License:        Apache-2.0
 BuildArch:      noarch
 Requires:       libzfs5 = %{version}-%{release}
 Requires:       libnvpair3 = %{version}-%{release}
 Requires:       libffi
 Requires:       python%{__python_pkg_version}
 
 %if 0%{?centos} == 7
 Requires:       python36-cffi
 %else
 Requires:       python%{__python_pkg_version}-cffi
 %endif
 
 %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler}
 %if 0%{?centos} == 7
 BuildRequires:  python36-packaging
 BuildRequires:  python36-devel
 BuildRequires:  python36-cffi
 BuildRequires:  python36-setuptools
 %else
 BuildRequires:  python%{__python_pkg_version}-packaging
 BuildRequires:  python%{__python_pkg_version}-devel
 BuildRequires:  python%{__python_pkg_version}-cffi
 BuildRequires:  python%{__python_pkg_version}-setuptools
 %endif
 
 BuildRequires:  libffi-devel
 %endif
 
 %description -n python%{__python_pkg_version}-pyzfs
 This package provides a python wrapper for the libzfs_core C library.
 %endif
 
 %if 0%{?_initramfs}
 %package initramfs
 Summary:        Initramfs module
 Group:          System Environment/Kernel
 Requires:       %{name}%{?_isa} = %{version}-%{release}
 Requires:       initramfs-tools
 
 %description initramfs
 This package contains a initramfs module used to construct an initramfs
 image which is ZFS aware.
 %endif
 
 %if %{with pam}
 %package -n pam_zfs_key
 Summary:        PAM module for encrypted ZFS datasets
 
 %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler}
 BuildRequires:  pam-devel
 %endif
 
 %description -n pam_zfs_key
 This package contains the pam_zfs_key PAM module, which provides
 support for unlocking datasets on user login.
 %endif
 
 %prep
 %if %{with debug}
     %define debug --enable-debug
 %else
     %define debug --disable-debug
 %endif
 
 %if %{with debuginfo}
     %define debuginfo --enable-debuginfo
 %else
     %define debuginfo --disable-debuginfo
 %endif
 
 %if %{with asan}
     %define asan --enable-asan
 %else
     %define asan --disable-asan
 %endif
 
 %if %{with ubsan}
     %define ubsan --enable-ubsan
 %else
     %define ubsan --disable-ubsan
 %endif
 
 %if 0%{?_systemd}
     %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit
     %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target
 %else
     %define systemd --enable-sysvinit --disable-systemd
 %endif
 
 %if %{with pyzfs}
     %define pyzfs --enable-pyzfs
 %else
     %define pyzfs --disable-pyzfs
 %endif
 
 %if %{with pam}
     %define pam --enable-pam
 %else
     %define pam --disable-pam
 %endif
 
 %setup -q
 
 %build
 %configure \
     --with-config=user \
     --with-udevdir=%{_udevdir} \
     --with-udevruledir=%{_udevruledir} \
     --with-dracutdir=%{_dracutdir} \
     --with-pamconfigsdir=%{_datadir}/pam-configs \
     --with-pammoduledir=%{_libdir}/security \
     --with-python=%{__python} \
     --with-pkgconfigdir=%{_pkgconfigdir} \
     --disable-static \
     %{debug} \
     %{debuginfo} \
     %{asan} \
     %{ubsan} \
     %{systemd} \
     %{pam} \
     %{pyzfs}
 make %{?_smp_mflags}
 
 %install
 %{__rm} -rf $RPM_BUILD_ROOT
 make install DESTDIR=%{?buildroot}
 find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \;
 %if 0%{!?__brp_mangle_shebangs:1}
 find %{?buildroot}%{_bindir} \
     \( -name arc_summary -or -name arcstat -or -name dbufstat \
     -or -name zilstat \) \
     -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \;
 find %{?buildroot}%{_datadir} \
     \( -name test-runner.py -or -name zts-report.py \) \
     -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \;
 %endif
 
 %post
 %if 0%{?_systemd}
 %if 0%{?systemd_post:1}
 %systemd_post %{systemd_svcs}
 %else
 if [ "$1" = "1" -o "$1" = "install" ] ; then
     # Initial installation
     systemctl preset %{systemd_svcs} >/dev/null || true
 fi
 %endif
 %else
 if [ -x /sbin/chkconfig ]; then
     /sbin/chkconfig --add zfs-import
     /sbin/chkconfig --add zfs-load-key
     /sbin/chkconfig --add zfs-mount
     /sbin/chkconfig --add zfs-share
     /sbin/chkconfig --add zfs-zed
 fi
 %endif
 exit 0
 
 # On RHEL/CentOS 7 the static nodes aren't refreshed by default after
 # installing a package.  This is the default behavior for Fedora.
 %posttrans
 %if 0%{?rhel} == 7 || 0%{?centos} == 7
 systemctl restart kmod-static-nodes
 systemctl restart systemd-tmpfiles-setup-dev
 udevadm trigger
 %endif
 
 %preun
 %if 0%{?_systemd}
 %if 0%{?systemd_preun:1}
 %systemd_preun %{systemd_svcs}
 %else
 if [ "$1" = "0" -o "$1" = "remove" ] ; then
     # Package removal, not upgrade
     systemctl --no-reload disable %{systemd_svcs} >/dev/null || true
     systemctl stop %{systemd_svcs} >/dev/null || true
 fi
 %endif
 %else
 if [ "$1" = "0" -o "$1" = "remove" ] && [ -x /sbin/chkconfig ]; then
     /sbin/chkconfig --del zfs-import
     /sbin/chkconfig --del zfs-load-key
     /sbin/chkconfig --del zfs-mount
     /sbin/chkconfig --del zfs-share
     /sbin/chkconfig --del zfs-zed
 fi
 %endif
 exit 0
 
 %postun
 %if 0%{?_systemd}
 %if 0%{?systemd_postun:1}
 %systemd_postun %{systemd_svcs}
 %else
 systemctl --system daemon-reload >/dev/null || true
 %endif
 %endif
 
 %files
 # Core utilities
 %{_sbindir}/*
 %{_bindir}/raidz_test
 %{_sbindir}/zgenhostid
 %{_bindir}/zvol_wait
 # Optional Python 3 scripts
 %{_bindir}/arc_summary
 %{_bindir}/arcstat
 %{_bindir}/dbufstat
 %{_bindir}/zilstat
 # Man pages
 %{_mandir}/man1/*
 %{_mandir}/man4/*
 %{_mandir}/man5/*
 %{_mandir}/man7/*
 %{_mandir}/man8/*
 # Configuration files and scripts
 %{_libexecdir}/%{name}
 %{_udevdir}/vdev_id
 %{_udevdir}/zvol_id
 %{_udevdir}/rules.d/*
 %{_datadir}/%{name}/compatibility.d
 %if ! 0%{?_systemd} || 0%{?_initramfs}
 # Files needed for sysvinit and initramfs-tools
 %{_sysconfdir}/%{name}/zfs-functions
 %config(noreplace) %{_initconfdir}/zfs
 %else
 %exclude %{_sysconfdir}/%{name}/zfs-functions
 %exclude %{_initconfdir}/zfs
 %endif
 %if 0%{?_systemd}
 %{_unitdir}/*
 %{_presetdir}/*
 %{_modulesloaddir}/*
 %{_systemdgeneratordir}/*
 %else
 %config(noreplace) %{_sysconfdir}/init.d/*
 %endif
 %config(noreplace) %{_sysconfdir}/%{name}/zed.d/*
 %config(noreplace) %{_sysconfdir}/%{name}/zpool.d/*
 %config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example
 %attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/*
 
 %config(noreplace) %{_bashcompletiondir}/zfs
+%config(noreplace) %{_bashcompletiondir}/zpool
 
 %files -n libzpool5
 %{_libdir}/libzpool.so.*
 
 %files -n libnvpair3
 %{_libdir}/libnvpair.so.*
 
 %files -n libuutil3
 %{_libdir}/libuutil.so.*
 
 %files -n libzfs5
 %{_libdir}/libzfs*.so.*
 
 %files -n libzfs5-devel
 %{_pkgconfigdir}/libzfs.pc
 %{_pkgconfigdir}/libzfsbootenv.pc
 %{_pkgconfigdir}/libzfs_core.pc
 %{_libdir}/*.so
 %{_includedir}/*
 %doc AUTHORS COPYRIGHT LICENSE NOTICE README.md
 
 %files test
 %{_datadir}/%{name}/zfs-tests
 %{_datadir}/%{name}/test-runner
 %{_datadir}/%{name}/runfiles
 %{_datadir}/%{name}/*.sh
 
 %files dracut
 %doc contrib/dracut/README.md
 %{_dracutdir}/modules.d/*
 
 %if %{with pyzfs}
 %files -n python%{__python_pkg_version}-pyzfs
 %doc contrib/pyzfs/README
 %doc contrib/pyzfs/LICENSE
 %defattr(-,root,root,-)
 %{__python_sitelib}/libzfs_core/*
 %{__python_sitelib}/pyzfs*
 %endif
 
 %if 0%{?_initramfs}
 %files initramfs
 %doc contrib/initramfs/README.md
 /usr/share/initramfs-tools/*
 %else
 # Since we're not building the initramfs package,
 # ignore those files.
 %exclude /usr/share/initramfs-tools
 %endif
 
 %if %{with pam}
 %files -n pam_zfs_key
 %{_libdir}/security/*
 %{_datadir}/pam-configs/*
 %endif
diff --git a/sys/contrib/openzfs/tests/runfiles/common.run b/sys/contrib/openzfs/tests/runfiles/common.run
index 3e1a3aeb6cbe..f302df81b919 100644
--- a/sys/contrib/openzfs/tests/runfiles/common.run
+++ b/sys/contrib/openzfs/tests/runfiles/common.run
@@ -1,1019 +1,1020 @@
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 # This run file contains all of the common functional tests.  When
 # adding a new test consider also adding it to the sanity.run file
 # if the new test runs to completion in only a few seconds.
 #
 # Approximate run time: 4-5 hours
 #
 
 [DEFAULT]
 pre = setup
 quiet = False
 pre_user = root
 user = root
 timeout = 600
 post_user = root
 post = cleanup
 failsafe_user = root
 failsafe = callbacks/zfs_failsafe
 outputdir = /var/tmp/test_results
 tags = ['functional']
 
 [tests/functional/acl/off]
 tests = ['dosmode', 'posixmode']
 tags = ['functional', 'acl']
 
 [tests/functional/alloc_class]
 tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos',
     'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos',
     'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos',
     'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos',
     'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos']
 tags = ['functional', 'alloc_class']
 
 [tests/functional/append]
 tests = ['file_append', 'threadsappend_001_pos']
 tags = ['functional', 'append']
 
 [tests/functional/arc]
 tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos',
     'arcstats_runtime_tuning']
 tags = ['functional', 'arc']
 
 [tests/functional/atime]
 tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on']
 tags = ['functional', 'atime']
 
 [tests/functional/bclone]
 tests = ['bclone_crossfs_corner_cases_limited',
     'bclone_crossfs_data',
     'bclone_crossfs_embedded',
     'bclone_crossfs_hole',
     'bclone_diffprops_all',
     'bclone_diffprops_checksum',
     'bclone_diffprops_compress',
     'bclone_diffprops_copies',
     'bclone_diffprops_recordsize',
     'bclone_prop_sync',
     'bclone_samefs_corner_cases_limited',
     'bclone_samefs_data',
     'bclone_samefs_embedded',
     'bclone_samefs_hole']
 tags = ['functional', 'bclone']
 timeout = 7200
 
 [tests/functional/block_cloning]
 tests = ['block_cloning_clone_mmap_cached',
     'block_cloning_copyfilerange',
     'block_cloning_copyfilerange_partial',
     'block_cloning_copyfilerange_fallback',
     'block_cloning_disabled_copyfilerange',
     'block_cloning_copyfilerange_cross_dataset',
     'block_cloning_cross_enc_dataset',
     'block_cloning_copyfilerange_fallback_same_txg',
     'block_cloning_replay', 'block_cloning_replay_encrypted',
-    'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write']
+    'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write',
+    'block_cloning_rlimit_fsize']
 tags = ['functional', 'block_cloning']
 
 [tests/functional/bootfs]
 tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos',
     'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos',
     'bootfs_008_pos']
 tags = ['functional', 'bootfs']
 
 [tests/functional/btree]
 tests = ['btree_positive', 'btree_negative']
 tags = ['functional', 'btree']
 pre =
 post =
 
 [tests/functional/cache]
 tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg',
     'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg',
     'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos']
 tags = ['functional', 'cache']
 
 [tests/functional/cachefile]
 tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos',
     'cachefile_004_pos']
 tags = ['functional', 'cachefile']
 
 [tests/functional/casenorm]
 tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure',
     'sensitive_none_lookup', 'sensitive_none_delete',
     'sensitive_formd_lookup', 'sensitive_formd_delete',
     'insensitive_none_lookup', 'insensitive_none_delete',
     'insensitive_formd_lookup', 'insensitive_formd_delete',
     'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete',
     'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete']
 tags = ['functional', 'casenorm']
 
 [tests/functional/channel_program/lua_core]
 tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists',
     'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg',
     'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries',
     'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua',
     'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large',
     'tst.return_nvlist_neg', 'tst.return_nvlist_pos',
     'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout']
 tags = ['functional', 'channel_program', 'lua_core']
 
 [tests/functional/channel_program/synctask_core]
 tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit',
     'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg',
     'tst.get_number_props', 'tst.get_string_props', 'tst.get_type',
     'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks',
     'tst.list_children', 'tst.list_clones', 'tst.list_holds',
     'tst.list_snapshots', 'tst.list_system_props',
     'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict',
     'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult',
     'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg',
     'tst.snapshot_recursive', 'tst.snapshot_rename', 'tst.snapshot_simple',
     'tst.bookmark.create', 'tst.bookmark.copy',
     'tst.terminate_by_signal'
     ]
 tags = ['functional', 'channel_program', 'synctask_core']
 
 [tests/functional/checksum]
 tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'run_blake3_test',
     'filetest_001_pos', 'filetest_002_pos']
 tags = ['functional', 'checksum']
 
 [tests/functional/clean_mirror]
 tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos',
     'clean_mirror_003_pos', 'clean_mirror_004_pos']
 tags = ['functional', 'clean_mirror']
 
 [tests/functional/cli_root/zdb]
 tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos',
     'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos',
     'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress',
     'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum',
     'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id',
     'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zdb']
 timeout = 1200
 
 [tests/functional/cli_root/zfs]
 tests = ['zfs_001_neg', 'zfs_002_pos']
 tags = ['functional', 'cli_root', 'zfs']
 
 [tests/functional/cli_root/zfs_bookmark]
 tests = ['zfs_bookmark_cliargs']
 tags = ['functional', 'cli_root', 'zfs_bookmark']
 
 [tests/functional/cli_root/zfs_change-key]
 tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format',
     'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location',
     'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones']
 tags = ['functional', 'cli_root', 'zfs_change-key']
 
 [tests/functional/cli_root/zfs_clone]
 tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos',
     'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos',
     'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg',
     'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested',
     'zfs_clone_rm_nested']
 tags = ['functional', 'cli_root', 'zfs_clone']
 
 [tests/functional/cli_root/zfs_copies]
 tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos',
     'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos']
 tags = ['functional', 'cli_root', 'zfs_copies']
 
 [tests/functional/cli_root/zfs_create]
 tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos',
     'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos',
     'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg',
     'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos',
     'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted',
     'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_nomount',
     'zfs_create_verbose']
 tags = ['functional', 'cli_root', 'zfs_create']
 
 [tests/functional/cli_root/zfs_destroy]
 tests = ['zfs_clone_livelist_condense_and_disable',
     'zfs_clone_livelist_condense_races', 'zfs_clone_livelist_dedup',
     'zfs_destroy_001_pos', 'zfs_destroy_002_pos', 'zfs_destroy_003_pos',
     'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg',
     'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos',
     'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos',
     'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos',
     'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist',
     'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense']
 tags = ['functional', 'cli_root', 'zfs_destroy']
 
 [tests/functional/cli_root/zfs_diff]
 tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp',
     'zfs_diff_types', 'zfs_diff_encrypted', 'zfs_diff_mangle']
 tags = ['functional', 'cli_root', 'zfs_diff']
 
 [tests/functional/cli_root/zfs_get]
 tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos',
     'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg',
     'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg']
 tags = ['functional', 'cli_root', 'zfs_get']
 
 [tests/functional/cli_root/zfs_ids_to_path]
 tests = ['zfs_ids_to_path_001_pos']
 tags = ['functional', 'cli_root', 'zfs_ids_to_path']
 
 [tests/functional/cli_root/zfs_inherit]
 tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos',
     'zfs_inherit_mountpoint']
 tags = ['functional', 'cli_root', 'zfs_inherit']
 
 [tests/functional/cli_root/zfs_load-key]
 tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file',
     'zfs_load-key_https', 'zfs_load-key_location', 'zfs_load-key_noop',
     'zfs_load-key_recursive']
 tags = ['functional', 'cli_root', 'zfs_load-key']
 
 [tests/functional/cli_root/zfs_mount]
 tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
     'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos',
     'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg',
     'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted',
     'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
     'zfs_mount_test_race', 'zfs_mount_recursive']
 tags = ['functional', 'cli_root', 'zfs_mount']
 
 [tests/functional/cli_root/zfs_program]
 tests = ['zfs_program_json']
 tags = ['functional', 'cli_root', 'zfs_program']
 
 [tests/functional/cli_root/zfs_promote]
 tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos',
     'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg',
     'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot']
 tags = ['functional', 'cli_root', 'zfs_promote']
 
 [tests/functional/cli_root/zfs_property]
 tests = ['zfs_written_property_001_pos']
 tags = ['functional', 'cli_root', 'zfs_property']
 
 [tests/functional/cli_root/zfs_receive]
 tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
     'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos',
     'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg',
     'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos',
     'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos',
     'zfs_receive_016_pos', 'receive-o-x_props_override',
     'receive-o-x_props_aliases',
     'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted',
     'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e',
     'zfs_receive_raw_-d', 'zfs_receive_from_zstd', 'zfs_receive_new_props',
     'zfs_receive_-wR-encrypted-mix', 'zfs_receive_corrective',
     'zfs_receive_compressed_corrective', 'zfs_receive_large_block_corrective']
 tags = ['functional', 'cli_root', 'zfs_receive']
 
 [tests/functional/cli_root/zfs_rename]
 tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos',
     'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos',
     'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg',
     'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg',
     'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child',
     'zfs_rename_to_encrypted', 'zfs_rename_mountpoint', 'zfs_rename_nounmount']
 tags = ['functional', 'cli_root', 'zfs_rename']
 
 [tests/functional/cli_root/zfs_reservation]
 tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
 tags = ['functional', 'cli_root', 'zfs_reservation']
 
 [tests/functional/cli_root/zfs_rollback]
 tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos',
     'zfs_rollback_003_neg', 'zfs_rollback_004_neg']
 tags = ['functional', 'cli_root', 'zfs_rollback']
 
 [tests/functional/cli_root/zfs_send]
 tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos',
     'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos',
     'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_encrypted_unloaded',
     'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing']
 tags = ['functional', 'cli_root', 'zfs_send']
 
 [tests/functional/cli_root/zfs_set]
 tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos',
     'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos',
     'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos',
     'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos',
     'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos',
     'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos',
     'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg',
     'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos',
     'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation',
     'zfs_set_feature_activation', 'zfs_set_nomount']
 tags = ['functional', 'cli_root', 'zfs_set']
 
 [tests/functional/cli_root/zfs_share]
 tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos',
     'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg',
     'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares',
     'zfs_share_after_mount']
 tags = ['functional', 'cli_root', 'zfs_share']
 
 [tests/functional/cli_root/zfs_snapshot]
 tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg',
     'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg',
     'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg',
     'zfs_snapshot_009_pos']
 tags = ['functional', 'cli_root', 'zfs_snapshot']
 
 [tests/functional/cli_root/zfs_unload-key]
 tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive']
 tags = ['functional', 'cli_root', 'zfs_unload-key']
 
 [tests/functional/cli_root/zfs_unmount]
 tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos',
     'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos',
     'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos',
     'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys']
 tags = ['functional', 'cli_root', 'zfs_unmount']
 
 [tests/functional/cli_root/zfs_unshare]
 tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos',
     'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos',
     'zfs_unshare_007_pos']
 tags = ['functional', 'cli_root', 'zfs_unshare']
 
 [tests/functional/cli_root/zfs_upgrade]
 tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos',
     'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg',
     'zfs_upgrade_007_neg']
 tags = ['functional', 'cli_root', 'zfs_upgrade']
 
 [tests/functional/cli_root/zfs_wait]
 tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt']
 tags = ['functional', 'cli_root', 'zfs_wait']
 
 [tests/functional/cli_root/zhack]
 tests = ['zhack_label_repair_001', 'zhack_label_repair_002',
     'zhack_label_repair_003', 'zhack_label_repair_004']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zhack']
 
 [tests/functional/cli_root/zpool]
 tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors']
 tags = ['functional', 'cli_root', 'zpool']
 
 [tests/functional/cli_root/zpool_add]
 tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos',
     'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg',
     'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos',
     'add-o_ashift', 'add_prop_ashift', 'zpool_add_dryrun_output',
     'zpool_add--allow-ashift-mismatch']
 tags = ['functional', 'cli_root', 'zpool_add']
 
 [tests/functional/cli_root/zpool_attach]
 tests = ['zpool_attach_001_neg', 'attach-o_ashift']
 tags = ['functional', 'cli_root', 'zpool_attach']
 
 [tests/functional/cli_root/zpool_clear]
 tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg',
     'zpool_clear_readonly']
 tags = ['functional', 'cli_root', 'zpool_clear']
 
 [tests/functional/cli_root/zpool_create]
 tests = ['zpool_create_001_pos', 'zpool_create_002_pos',
     'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos',
     'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos',
     'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg',
     'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg',
     'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos',
     'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos',
     'zpool_create_023_neg', 'zpool_create_024_pos',
     'zpool_create_encrypted', 'zpool_create_crypt_combos',
     'zpool_create_draid_001_pos', 'zpool_create_draid_002_pos',
     'zpool_create_draid_003_pos', 'zpool_create_draid_004_pos',
     'zpool_create_features_001_pos', 'zpool_create_features_002_pos',
     'zpool_create_features_003_pos', 'zpool_create_features_004_neg',
     'zpool_create_features_005_pos', 'zpool_create_features_006_pos',
     'zpool_create_features_007_pos', 'zpool_create_features_008_pos',
     'zpool_create_features_009_pos', 'create-o_ashift',
     'zpool_create_tempname', 'zpool_create_dryrun_output']
 tags = ['functional', 'cli_root', 'zpool_create']
 
 [tests/functional/cli_root/zpool_destroy]
 tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos',
     'zpool_destroy_003_neg']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zpool_destroy']
 
 [tests/functional/cli_root/zpool_detach]
 tests = ['zpool_detach_001_neg']
 tags = ['functional', 'cli_root', 'zpool_detach']
 
 [tests/functional/cli_root/zpool_events]
 tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow',
     'zpool_events_poolname', 'zpool_events_errors', 'zpool_events_duplicates',
     'zpool_events_clear_retained']
 tags = ['functional', 'cli_root', 'zpool_events']
 
 [tests/functional/cli_root/zpool_export]
 tests = ['zpool_export_001_pos', 'zpool_export_002_pos',
     'zpool_export_003_neg', 'zpool_export_004_pos']
 tags = ['functional', 'cli_root', 'zpool_export']
 
 [tests/functional/cli_root/zpool_get]
 tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos',
     'zpool_get_004_neg', 'zpool_get_005_pos', 'vdev_get_001_pos']
 tags = ['functional', 'cli_root', 'zpool_get']
 
 [tests/functional/cli_root/zpool_history]
 tests = ['zpool_history_001_neg', 'zpool_history_002_pos']
 tags = ['functional', 'cli_root', 'zpool_history']
 
 [tests/functional/cli_root/zpool_import]
 tests = ['zpool_import_001_pos', 'zpool_import_002_pos',
     'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos',
     'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos',
     'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg',
     'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos',
     'zpool_import_015_pos', 'zpool_import_016_pos', 'zpool_import_017_pos',
     'zpool_import_features_001_pos', 'zpool_import_features_002_neg',
     'zpool_import_features_003_pos', 'zpool_import_missing_001_pos',
     'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos',
     'zpool_import_rename_001_pos', 'zpool_import_all_001_pos',
     'zpool_import_encrypted', 'zpool_import_encrypted_load',
     'zpool_import_errata3', 'zpool_import_errata4',
     'import_cachefile_device_added',
     'import_cachefile_device_removed',
     'import_cachefile_device_replaced',
     'import_cachefile_mirror_attached',
     'import_cachefile_mirror_detached',
     'import_cachefile_paths_changed',
     'import_cachefile_shared_device',
     'import_devices_missing', 'import_log_missing',
     'import_paths_changed',
     'import_rewind_config_changed',
     'import_rewind_device_replaced',
     'zpool_import_status']
 tags = ['functional', 'cli_root', 'zpool_import']
 timeout = 1200
 
 [tests/functional/cli_root/zpool_labelclear]
 tests = ['zpool_labelclear_active', 'zpool_labelclear_exported',
     'zpool_labelclear_removed', 'zpool_labelclear_valid']
 pre =
 post =
 tags = ['functional', 'cli_root', 'zpool_labelclear']
 
 [tests/functional/cli_root/zpool_initialize]
 tests = ['zpool_initialize_attach_detach_add_remove',
     'zpool_initialize_fault_export_import_online',
     'zpool_initialize_import_export',
     'zpool_initialize_offline_export_import_online',
     'zpool_initialize_online_offline',
     'zpool_initialize_split',
     'zpool_initialize_start_and_cancel_neg',
     'zpool_initialize_start_and_cancel_pos',
     'zpool_initialize_suspend_resume',
     'zpool_initialize_uninit',
     'zpool_initialize_unsupported_vdevs',
     'zpool_initialize_verify_checksums',
     'zpool_initialize_verify_initialized']
 pre =
 tags = ['functional', 'cli_root', 'zpool_initialize']
 
 [tests/functional/cli_root/zpool_offline]
 tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg',
     'zpool_offline_003_pos']
 tags = ['functional', 'cli_root', 'zpool_offline']
 
 [tests/functional/cli_root/zpool_online]
 tests = ['zpool_online_001_pos', 'zpool_online_002_neg']
 tags = ['functional', 'cli_root', 'zpool_online']
 
 [tests/functional/cli_root/zpool_remove]
 tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos',
     'zpool_remove_003_pos']
 tags = ['functional', 'cli_root', 'zpool_remove']
 
 [tests/functional/cli_root/zpool_replace]
 tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift']
 tags = ['functional', 'cli_root', 'zpool_replace']
 
 [tests/functional/cli_root/zpool_resilver]
 tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart',
     'zpool_resilver_concurrent']
 tags = ['functional', 'cli_root', 'zpool_resilver']
 
 [tests/functional/cli_root/zpool_scrub]
 tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
     'zpool_scrub_004_pos', 'zpool_scrub_005_pos',
     'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
     'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies',
     'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos',
     'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos']
 tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
 tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
     'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
     'user_property_001_pos', 'user_property_002_neg']
 tags = ['functional', 'cli_root', 'zpool_set']
 
 [tests/functional/cli_root/zpool_split]
 tests = ['zpool_split_cliargs', 'zpool_split_devices',
     'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs',
     'zpool_split_resilver', 'zpool_split_indirect',
     'zpool_split_dryrun_output']
 tags = ['functional', 'cli_root', 'zpool_split']
 
 [tests/functional/cli_root/zpool_status]
 tests = ['zpool_status_001_pos', 'zpool_status_002_pos',
     'zpool_status_003_pos', 'zpool_status_004_pos',
     'zpool_status_005_pos', 'zpool_status_006_pos',
     'zpool_status_007_pos', 'zpool_status_008_pos',
     'zpool_status_features_001_pos']
 tags = ['functional', 'cli_root', 'zpool_status']
 
 [tests/functional/cli_root/zpool_sync]
 tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg']
 tags = ['functional', 'cli_root', 'zpool_sync']
 
 [tests/functional/cli_root/zpool_trim]
 tests = ['zpool_trim_attach_detach_add_remove',
     'zpool_trim_fault_export_import_online',
     'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg',
     'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline',
     'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg',
     'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg',
     'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume',
     'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums',
     'zpool_trim_verify_trimmed']
 tags = ['functional', 'zpool_trim']
 
 [tests/functional/cli_root/zpool_upgrade]
 tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos',
     'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos',
     'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg',
     'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos',
     'zpool_upgrade_009_neg', 'zpool_upgrade_features_001_pos']
 tags = ['functional', 'cli_root', 'zpool_upgrade']
 
 [tests/functional/cli_root/zpool_wait]
 tests = ['zpool_wait_discard', 'zpool_wait_freeing',
     'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel',
     'zpool_wait_initialize_flag', 'zpool_wait_multiple',
     'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel',
     'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag',
     'zpool_wait_usage']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_root/zpool_wait/scan]
 tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild',
     'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
     'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_user/misc]
 tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg',
     'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg',
     'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg',
     'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg',
     'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg',
     'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg',
     'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg',
     'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg',
     'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg',
     'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg',
     'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg',
     'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg',
     'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg',
     'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos',
     'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege',
     'zilstat_001_pos']
 user =
 tags = ['functional', 'cli_user', 'misc']
 
 [tests/functional/cli_user/zfs_list]
 tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos',
     'zfs_list_004_neg', 'zfs_list_005_neg', 'zfs_list_007_pos',
     'zfs_list_008_neg']
 user =
 tags = ['functional', 'cli_user', 'zfs_list']
 
 [tests/functional/cli_user/zpool_iostat]
 tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos',
     'zpool_iostat_003_neg', 'zpool_iostat_004_pos',
     'zpool_iostat_005_pos', 'zpool_iostat_-c_disable',
     'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath']
 user =
 tags = ['functional', 'cli_user', 'zpool_iostat']
 
 [tests/functional/cli_user/zpool_list]
 tests = ['zpool_list_001_pos', 'zpool_list_002_neg']
 user =
 tags = ['functional', 'cli_user', 'zpool_list']
 
 [tests/functional/cli_user/zpool_status]
 tests = ['zpool_status_003_pos', 'zpool_status_-c_disable',
     'zpool_status_-c_homedir', 'zpool_status_-c_searchpath']
 user =
 tags = ['functional', 'cli_user', 'zpool_status']
 
 [tests/functional/compression]
 tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos',
     'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled',
     'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc']
 tags = ['functional', 'compression']
 
 [tests/functional/cp_files]
 tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress']
 tags = ['functional', 'cp_files']
 
 [tests/functional/crtime]
 tests = ['crtime_001_pos' ]
 tags = ['functional', 'crtime']
 
 [tests/functional/ctime]
 tests = ['ctime_001_pos' ]
 tags = ['functional', 'ctime']
 
 [tests/functional/deadman]
 tests = ['deadman_ratelimit', 'deadman_sync', 'deadman_zio']
 pre =
 post =
 tags = ['functional', 'deadman']
 
 [tests/functional/delegate]
 tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos',
     'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos',
     'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg',
     'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg',
     'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos',
     'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos',
     'zfs_unallow_007_neg', 'zfs_unallow_008_neg']
 tags = ['functional', 'delegate']
 
 [tests/functional/exec]
 tests = ['exec_001_pos', 'exec_002_neg']
 tags = ['functional', 'exec']
 
 [tests/functional/fallocate]
 tests = ['fallocate_punch-hole']
 tags = ['functional', 'fallocate']
 
 [tests/functional/features/async_destroy]
 tests = ['async_destroy_001_pos']
 tags = ['functional', 'features', 'async_destroy']
 
 [tests/functional/features/large_dnode]
 tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg',
     'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos']
 tags = ['functional', 'features', 'large_dnode']
 
 [tests/functional/grow]
 pre =
 post =
 tests = ['grow_pool_001_pos', 'grow_replicas_001_pos']
 tags = ['functional', 'grow']
 
 [tests/functional/history]
 tests = ['history_001_pos', 'history_002_pos', 'history_003_pos',
     'history_004_pos', 'history_005_neg', 'history_006_neg',
     'history_007_pos', 'history_008_pos', 'history_009_pos',
     'history_010_pos']
 tags = ['functional', 'history']
 
 [tests/functional/hkdf]
 pre =
 post =
 tests = ['hkdf_test']
 tags = ['functional', 'hkdf']
 
 [tests/functional/inheritance]
 tests = ['inherit_001_pos']
 pre =
 tags = ['functional', 'inheritance']
 
 [tests/functional/io]
 tests = ['sync', 'psync', 'posixaio', 'mmap']
 tags = ['functional', 'io']
 
 [tests/functional/inuse]
 tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos']
 post =
 tags = ['functional', 'inuse']
 
 [tests/functional/large_files]
 tests = ['large_files_001_pos', 'large_files_002_pos']
 tags = ['functional', 'large_files']
 
 [tests/functional/limits]
 tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count',
     'snapshot_limit']
 tags = ['functional', 'limits']
 
 [tests/functional/link_count]
 tests = ['link_count_001', 'link_count_root_inode']
 tags = ['functional', 'link_count']
 
 [tests/functional/migration]
 tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos',
     'migration_004_pos', 'migration_005_pos', 'migration_006_pos',
     'migration_007_pos', 'migration_008_pos', 'migration_009_pos',
     'migration_010_pos', 'migration_011_pos', 'migration_012_pos']
 tags = ['functional', 'migration']
 
 [tests/functional/mmap]
 tests = ['mmap_mixed', 'mmap_read_001_pos', 'mmap_seek_001_pos',
     'mmap_sync_001_pos', 'mmap_write_001_pos']
 tags = ['functional', 'mmap']
 
 [tests/functional/mount]
 tests = ['umount_001', 'umountall_001']
 tags = ['functional', 'mount']
 
 [tests/functional/mv_files]
 tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation']
 tags = ['functional', 'mv_files']
 
 [tests/functional/nestedfs]
 tests = ['nestedfs_001_pos']
 tags = ['functional', 'nestedfs']
 
 [tests/functional/no_space]
 tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos',
     'enospc_df', 'enospc_ganging', 'enospc_rm']
 tags = ['functional', 'no_space']
 
 [tests/functional/nopwrite]
 tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative',
     'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync',
     'nopwrite_varying_compression', 'nopwrite_volume']
 tags = ['functional', 'nopwrite']
 
 [tests/functional/online_offline]
 tests = ['online_offline_001_pos', 'online_offline_002_neg',
     'online_offline_003_neg']
 tags = ['functional', 'online_offline']
 
 [tests/functional/pool_checkpoint]
 tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind',
     'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard',
     'checkpoint_discard_busy', 'checkpoint_discard_many',
     'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz',
     'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind',
     'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice',
     'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat']
 tags = ['functional', 'pool_checkpoint']
 timeout = 1800
 
 [tests/functional/pool_names]
 tests = ['pool_names_001_pos', 'pool_names_002_neg']
 pre =
 post =
 tags = ['functional', 'pool_names']
 
 [tests/functional/poolversion]
 tests = ['poolversion_001_pos', 'poolversion_002_pos']
 tags = ['functional', 'poolversion']
 
 [tests/functional/pyzfs]
 tests = ['pyzfs_unittest']
 pre =
 post =
 tags = ['functional', 'pyzfs']
 
 [tests/functional/quota]
 tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos',
          'quota_004_pos', 'quota_005_pos', 'quota_006_neg']
 tags = ['functional', 'quota']
 
 [tests/functional/redacted_send]
 tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted',
     'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes',
     'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones',
     'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative',
     'redacted_origin', 'redacted_panic', 'redacted_props', 'redacted_resume',
     'redacted_size', 'redacted_volume']
 tags = ['functional', 'redacted_send']
 
 [tests/functional/raidz]
 tests = ['raidz_001_neg', 'raidz_002_pos', 'raidz_003_pos', 'raidz_004_pos']
 tags = ['functional', 'raidz']
 
 [tests/functional/redundancy]
 tests = ['redundancy_draid', 'redundancy_draid1', 'redundancy_draid2',
     'redundancy_draid3', 'redundancy_draid_damaged1',
     'redundancy_draid_damaged2', 'redundancy_draid_spare1',
     'redundancy_draid_spare2', 'redundancy_draid_spare3', 'redundancy_mirror',
     'redundancy_raidz', 'redundancy_raidz1', 'redundancy_raidz2',
     'redundancy_raidz3', 'redundancy_stripe']
 tags = ['functional', 'redundancy']
 timeout = 1200
 
 [tests/functional/refquota]
 tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos',
     'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg',
     'refquota_007_neg', 'refquota_008_neg']
 tags = ['functional', 'refquota']
 
 [tests/functional/refreserv]
 tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos',
     'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz',
     'refreserv_raidz']
 tags = ['functional', 'refreserv']
 
 [tests/functional/removal]
 pre =
 tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space',
     'removal_condense_export', 'removal_multiple_indirection',
     'removal_nopwrite', 'removal_remap_deadlists',
     'removal_resume_export', 'removal_sanity', 'removal_with_add',
     'removal_with_create_fs', 'removal_with_dedup',
     'removal_with_errors', 'removal_with_export', 'removal_with_indirect',
     'removal_with_ganging', 'removal_with_faulted',
     'removal_with_remove', 'removal_with_scrub', 'removal_with_send',
     'removal_with_send_recv', 'removal_with_snapshot',
     'removal_with_write', 'removal_with_zdb', 'remove_expanded',
     'remove_mirror', 'remove_mirror_sanity', 'remove_raidz',
     'remove_indirect', 'remove_attach_mirror', 'removal_reservation']
 tags = ['functional', 'removal']
 
 [tests/functional/rename_dirs]
 tests = ['rename_dirs_001_pos']
 tags = ['functional', 'rename_dirs']
 
 [tests/functional/replacement]
 tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
     'attach_resilver', 'detach', 'rebuild_disabled_feature',
     'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
     'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
     'scrub_cancel']
 tags = ['functional', 'replacement']
 
 [tests/functional/reservation]
 tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
     'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos',
     'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos',
     'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos',
     'reservation_013_pos', 'reservation_014_pos', 'reservation_015_pos',
     'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos',
     'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg',
     'reservation_022_pos']
 tags = ['functional', 'reservation']
 
 [tests/functional/rootpool]
 tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos']
 tags = ['functional', 'rootpool']
 
 [tests/functional/rsend]
 tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos',
     'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos',
     'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos',
     'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos',
     'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos',
     'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', 'rsend_025_pos',
     'rsend_026_neg', 'rsend_027_pos', 'rsend_028_neg', 'rsend_029_neg',
     'rsend_030_pos', 'rsend_031_pos', 'send-c_verify_ratio',
     'send-c_verify_contents', 'send-c_props', 'send-c_incremental',
     'send-c_volume', 'send-c_zstream_recompress', 'send-c_zstreamdump',
     'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
     'send-c_mixed_compression', 'send-c_stream_size_estimate',
     'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
     'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_incremental',
     'send_encrypted_freeobjects', 'send_encrypted_hierarchy',
     'send_encrypted_props', 'send_encrypted_truncated_files',
     'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files',
     'send_spill_block', 'send_holds', 'send_hole_birth', 'send_mixed_raw',
     'send-wR_encrypted_zvol', 'send_partial_dataset', 'send_invalid',
     'send_doall', 'send_raw_spill_block', 'send_raw_ashift',
     'send_raw_large_blocks']
 tags = ['functional', 'rsend']
 
 [tests/functional/scrub_mirror]
 tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
     'scrub_mirror_003_pos', 'scrub_mirror_004_pos']
 tags = ['functional', 'scrub_mirror']
 
 [tests/functional/slog]
 tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
     'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg',
     'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg',
     'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001',
     'slog_replay_fs_002', 'slog_replay_volume', 'slog_016_pos']
 tags = ['functional', 'slog']
 
 [tests/functional/snapshot]
 tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos',
     'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos',
     'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos',
     'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos',
     'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos',
     'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos',
     'snapshot_017_pos', 'snapshot_018_pos']
 tags = ['functional', 'snapshot']
 
 [tests/functional/snapused]
 tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos',
     'snapused_004_pos', 'snapused_005_pos']
 tags = ['functional', 'snapused']
 
 [tests/functional/sparse]
 tests = ['sparse_001_pos']
 tags = ['functional', 'sparse']
 
 [tests/functional/stat]
 tests = ['stat_001_pos']
 tags = ['functional', 'stat']
 
 [tests/functional/suid]
 tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid',
     'suid_write_to_none', 'suid_write_zil_replay']
 tags = ['functional', 'suid']
 
 [tests/functional/trim]
 tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity',
     'trim_integrity', 'trim_config', 'trim_l2arc']
 tags = ['functional', 'trim']
 
 [tests/functional/truncate]
 tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps']
 tags = ['functional', 'truncate']
 
 [tests/functional/upgrade]
 tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool']
 tags = ['functional', 'upgrade']
 
 [tests/functional/userquota]
 tests = [
     'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos',
     'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos',
     'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos',
     'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg',
     'userspace_001_pos', 'userspace_002_pos', 'userspace_encrypted',
     'userspace_send_encrypted', 'userspace_encrypted_13709']
 tags = ['functional', 'userquota']
 
 [tests/functional/vdev_zaps]
 tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos',
     'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos',
     'vdev_zaps_007_pos']
 tags = ['functional', 'vdev_zaps']
 
 [tests/functional/write_dirs]
 tests = ['write_dirs_001_pos', 'write_dirs_002_pos']
 tags = ['functional', 'write_dirs']
 
 [tests/functional/xattr]
 tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos',
     'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg',
     'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos', 'xattr_compat']
 tags = ['functional', 'xattr']
 
 [tests/functional/zvol/zvol_ENOSPC]
 tests = ['zvol_ENOSPC_001_pos']
 tags = ['functional', 'zvol', 'zvol_ENOSPC']
 
 [tests/functional/zvol/zvol_cli]
 tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg']
 tags = ['functional', 'zvol', 'zvol_cli']
 
 [tests/functional/zvol/zvol_misc]
 tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse',
     'zvol_misc_snapdev', 'zvol_misc_trim', 'zvol_misc_volmode', 'zvol_misc_zil']
 tags = ['functional', 'zvol', 'zvol_misc']
 
 [tests/functional/zvol/zvol_stress]
 tests = ['zvol_stress']
 tags = ['functional', 'zvol', 'zvol_stress']
 
 [tests/functional/zvol/zvol_swap]
 tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos']
 tags = ['functional', 'zvol', 'zvol_swap']
 
 [tests/functional/libzfs]
 tests = ['many_fds', 'libzfs_input']
 tags = ['functional', 'libzfs']
 
 [tests/functional/log_spacemap]
 tests = ['log_spacemap_import_logs']
 pre =
 post =
 tags = ['functional', 'log_spacemap']
 
 [tests/functional/l2arc]
 tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos',
     'persist_l2arc_001_pos', 'persist_l2arc_002_pos',
     'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos']
 tags = ['functional', 'l2arc']
 
 [tests/functional/zpool_influxdb]
 tests = ['zpool_influxdb']
 tags = ['functional', 'zpool_influxdb']
diff --git a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in
index de06c7c6e2c1..3370f9b4e350 100755
--- a/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in
+++ b/sys/contrib/openzfs/tests/test-runner/bin/zts-report.py.in
@@ -1,525 +1,527 @@
 #!/usr/bin/env @PYTHON_SHEBANG@
 
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 
 #
 # Copyright (c) 2017 by Delphix. All rights reserved.
 # Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
 #
 # This script must remain compatible with Python 3.6+.
 #
 
 import os
 import re
 import sys
 import argparse
 
 #
 # This script parses the stdout of zfstest, which has this format:
 #
 # Test: /path/to/testa (run as root) [00:00] [PASS]
 # Test: /path/to/testb (run as jkennedy) [00:00] [PASS]
 # Test: /path/to/testc (run as root) [00:00] [FAIL]
 # [...many more results...]
 #
 # Results Summary
 # FAIL      22
 # SKIP      32
 # PASS    1156
 #
 # Running Time:   02:50:31
 # Percent passed: 95.5%
 # Log directory:  /var/tmp/test_results/20180615T205926
 #
 
 #
 # Common generic reasons for a test or test group to be skipped.
 #
 # Some test cases are known to fail in ways which are not harmful or dangerous.
 # In these cases simply mark the test as a known failure until it can be
 # updated and the issue resolved.  Note that it's preferable to open a unique
 # issue on the GitHub issue tracker for each test case failure.
 #
 known_reason = 'Known issue'
 
 #
 # Some tests require that a test user be able to execute the zfs utilities.
 # This may not be possible when testing in-tree due to the default permissions
 # on the user's home directory.  When testing this can be resolved by granting
 # group read access.
 #
 # chmod 0750 $HOME
 #
 exec_reason = 'Test user execute permissions required for utilities'
 
 #
 # Some tests require that the kernel supports renameat2 syscall.
 #
 renameat2_reason = 'Kernel renameat2 support required'
 
 #
 # Some tests require the O_TMPFILE flag which was first introduced in the
 # 3.11 kernel.
 #
 tmpfile_reason = 'Kernel O_TMPFILE support required'
 
 #
 # Some tests require the statx(2) system call on Linux which was first
 # introduced in the 4.11 kernel.
 #
 statx_reason = 'Kernel statx(2) system call required on Linux'
 
 #
 # Some tests require that the lsattr utility support the project id feature.
 #
 project_id_reason = 'lsattr with set/show project ID required'
 
 #
 # Some tests require that the kernel support user namespaces.
 #
 user_ns_reason = 'Kernel user namespace support required'
 
 #
 # Some rewind tests can fail since nothing guarantees that old MOS blocks
 # are not overwritten.  Snapshots protect datasets and data files but not
 # the MOS.  Reasonable efforts are made in the test case to increase the
 # odds that some txgs will have their MOS data left untouched, but it is
 # never a sure thing.
 #
 rewind_reason = 'Arbitrary pool rewind is not guaranteed'
 
 #
 # Some tests require a minimum version of the fio benchmark utility.
 # Older distributions such as CentOS 6.x only provide fio-2.0.13.
 #
 fio_reason = 'Fio v2.3 or newer required'
 
 #
 # Some tests require that the DISKS provided support the discard operation.
 # Normally this is not an issue because loop back devices are used for DISKS
 # and they support discard (TRIM/UNMAP).
 #
 trim_reason = 'DISKS must support discard (TRIM/UNMAP)'
 
 #
 # Some tests on FreeBSD require the fspacectl(2) system call and the
 # truncate(1) utility supporting the -d option.  The system call was first
 # introduced in FreeBSD version 1400032.
 #
 fspacectl_reason = 'fspacectl(2) and truncate -d support required'
 
 #
 # Some tests are not applicable to a platform or need to be updated to operate
 # in the manner required by the platform.  Any tests which are skipped for this
 # reason will be suppressed in the final analysis output.
 #
 na_reason = "Not applicable"
 
 #
 # Some test cases don't have all the requirements to run on GitHub Actions CI.
 #
 ci_reason = 'CI runner doesn\'t have all requirements'
 
 #
 # Idmapped mount is only supported in kernel version >= 5.12
 #
 idmap_reason = 'Idmapped mount needs kernel 5.12+'
 
 #
 # copy_file_range() is not supported by all kernels
 #
 cfr_reason = 'Kernel copy_file_range support required'
 
 if sys.platform.startswith('freebsd'):
     cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+'
 else:
     cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+'
 
 #
 # These tests are known to fail, thus we use this list to prevent these
 # failures from failing the job as a whole; only unexpected failures
 # bubble up to cause this script to exit with a non-zero exit status.
 #
 # Format: { 'test-name': ['expected result', 'issue-number | reason'] }
 #
 # For each known failure it is recommended to link to a GitHub issue by
 # setting the reason to the issue number.  Alternately, one of the generic
 # reasons listed above can be used.
 #
 known = {
     'casenorm/mixed_none_lookup_ci': ['FAIL', 7633],
     'casenorm/mixed_formd_lookup_ci': ['FAIL', 7633],
     'cli_root/zpool_import/import_rewind_device_replaced':
         ['FAIL', rewind_reason],
     'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason],
     'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason],
     'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053],
     'privilege/setup': ['SKIP', na_reason],
     'refreserv/refreserv_004_pos': ['FAIL', known_reason],
     'rootpool/setup': ['SKIP', na_reason],
     'rsend/rsend_008_pos': ['SKIP', 6066],
     'vdev_zaps/vdev_zaps_007_pos': ['FAIL', known_reason],
 }
 
 if sys.platform.startswith('freebsd'):
     known.update({
         'cli_root/zfs_receive/receive-o-x_props_override':
             ['FAIL', known_reason],
         'cli_root/zpool_resilver/zpool_resilver_concurrent':
             ['SKIP', na_reason],
         'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason],
         'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason],
         'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason],
         'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason],
         'cp_files/cp_files_002_pos': ['SKIP', na_reason],
         'link_count/link_count_001': ['SKIP', na_reason],
         'mmap/mmap_sync_001_pos': ['SKIP', na_reason],
         'rsend/send_raw_ashift': ['SKIP', 14961],
     })
 elif sys.platform.startswith('linux'):
     known.update({
         'casenorm/mixed_formd_lookup': ['FAIL', 7633],
         'casenorm/mixed_formd_delete': ['FAIL', 7633],
         'casenorm/sensitive_formd_lookup': ['FAIL', 7633],
         'casenorm/sensitive_formd_delete': ['FAIL', 7633],
         'removal/removal_with_zdb': ['SKIP', known_reason],
         'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason],
     })
 
 
 #
 # These tests may occasionally fail or be skipped.  We want their failures
 # to be reported but only unexpected failures should bubble up to cause
 # this script to exit with a non-zero exit status.
 #
 # Format: { 'test-name': ['expected result', 'issue-number | reason'] }
 #
 # For each known failure it is recommended to link to a GitHub issue by
 # setting the reason to the issue number.  Alternately, one of the generic
 # reasons listed above can be used.
 #
 maybe = {
     'append/threadsappend_001_pos': ['FAIL', 6136],
     'chattr/setup': ['SKIP', exec_reason],
     'crtime/crtime_001_pos': ['SKIP', statx_reason],
     'cli_root/zdb/zdb_006_pos': ['FAIL', known_reason],
     'cli_root/zfs_destroy/zfs_destroy_dev_removal_condense':
         ['FAIL', known_reason],
     'cli_root/zfs_get/zfs_get_004_pos': ['FAIL', known_reason],
     'cli_root/zfs_get/zfs_get_009_pos': ['SKIP', 5479],
     'cli_root/zfs_rollback/zfs_rollback_001_pos': ['FAIL', known_reason],
     'cli_root/zfs_rollback/zfs_rollback_002_pos': ['FAIL', known_reason],
     'cli_root/zfs_share/zfs_share_concurrent_shares': ['FAIL', known_reason],
     'cli_root/zfs_snapshot/zfs_snapshot_002_neg': ['FAIL', known_reason],
     'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason],
     'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason],
     'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', 6145],
     'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', 6839],
     'cli_root/zpool_initialize/zpool_initialize_import_export':
         ['FAIL', 11948],
     'cli_root/zpool_labelclear/zpool_labelclear_removed':
         ['FAIL', known_reason],
     'cli_root/zpool_trim/setup': ['SKIP', trim_reason],
     'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', 6141],
     'delegate/setup': ['SKIP', exec_reason],
     'fallocate/fallocate_punch-hole': ['SKIP', fspacectl_reason],
     'history/history_004_pos': ['FAIL', 7026],
     'history/history_005_neg': ['FAIL', 6680],
     'history/history_006_neg': ['FAIL', 5657],
     'history/history_008_pos': ['FAIL', known_reason],
     'history/history_010_pos': ['SKIP', exec_reason],
     'io/mmap': ['SKIP', fio_reason],
     'largest_pool/largest_pool_001_pos': ['FAIL', known_reason],
     'mmp/mmp_on_uberblocks': ['FAIL', known_reason],
     'pam/setup': ['SKIP', "pamtester might be not available"],
     'pool_checkpoint/checkpoint_discard_busy': ['FAIL', 11946],
     'projectquota/setup': ['SKIP', exec_reason],
     'removal/removal_condense_export': ['FAIL', known_reason],
     'renameat2/setup': ['SKIP', renameat2_reason],
     'reservation/reservation_008_pos': ['FAIL', 7741],
     'reservation/reservation_018_pos': ['FAIL', 5642],
     'snapshot/clone_001_pos': ['FAIL', known_reason],
     'snapshot/snapshot_009_pos': ['FAIL', 7961],
     'snapshot/snapshot_010_pos': ['FAIL', 7961],
     'snapused/snapused_004_pos': ['FAIL', 5513],
     'tmpfile/setup': ['SKIP', tmpfile_reason],
     'trim/setup': ['SKIP', trim_reason],
     'upgrade/upgrade_projectquota_001_pos': ['SKIP', project_id_reason],
     'user_namespace/setup': ['SKIP', user_ns_reason],
     'userquota/setup': ['SKIP', exec_reason],
     'vdev_zaps/vdev_zaps_004_pos': ['FAIL', known_reason],
     'zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos': ['FAIL', 5848],
 }
 
 if sys.platform.startswith('freebsd'):
     # Entries that only apply when running on FreeBSD.  Issue references
     # are kept as ints: the report only expands a reason into a GitHub
     # issue URL when it is an int, otherwise it is printed verbatim.
     maybe.update({
         'cli_root/zfs_copies/zfs_copies_002_pos': ['FAIL', known_reason],
         'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason],
         'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason],
         'delegate/zfs_allow_003_pos': ['FAIL', known_reason],
         'inheritance/inherit_001_pos': ['FAIL', 11829],
         'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622],
         'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623],
         'resilver/resilver_restart_001': ['FAIL', known_reason],
         'snapshot/snapshot_002_pos': ['FAIL', 14831],
         'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_corner_cases_limited':
             ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_recordsize': ['SKIP', cfr_cross_reason],
         'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason],
         'block_cloning/block_cloning_cross_enc_dataset':
             ['SKIP', cfr_cross_reason],
         'block_cloning/block_cloning_copyfilerange_cross_dataset':
             ['SKIP', cfr_cross_reason]
     })
 elif sys.platform.startswith('linux'):
     maybe.update({
         'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_corner_cases_limited':
             ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason],
         'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason],
         'bclone/bclone_diffprops_recordsize': ['SKIP', cfr_cross_reason],
         'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason],
         'bclone/bclone_samefs_corner_cases': ['SKIP', cfr_reason],
         'bclone/bclone_samefs_corner_cases_limited': ['SKIP', cfr_reason],
         'bclone/bclone_samefs_data': ['SKIP', cfr_reason],
         'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason],
         'bclone/bclone_samefs_hole': ['SKIP', cfr_reason],
         'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason],
         'block_cloning/block_cloning_clone_mmap_write':
             ['SKIP', cfr_reason],
         'block_cloning/block_cloning_copyfilerange':
             ['SKIP', cfr_reason],
         'block_cloning/block_cloning_copyfilerange_cross_dataset':
             ['SKIP', cfr_cross_reason],
         'block_cloning/block_cloning_copyfilerange_fallback':
             ['SKIP', cfr_reason],
         'block_cloning/block_cloning_copyfilerange_fallback_same_txg':
             ['SKIP', cfr_cross_reason],
         'block_cloning/block_cloning_copyfilerange_partial':
             ['SKIP', cfr_reason],
         'block_cloning/block_cloning_cross_enc_dataset':
             ['SKIP', cfr_cross_reason],
         'block_cloning/block_cloning_disabled_copyfilerange':
             ['SKIP', cfr_reason],
         'block_cloning/block_cloning_lwb_buffer_overflow':
             ['SKIP', cfr_reason],
         'block_cloning/block_cloning_replay':
             ['SKIP', cfr_reason],
         'block_cloning/block_cloning_replay_encrypted':
             ['SKIP', cfr_reason],
+        'block_cloning/block_cloning_rlimit_fsize':
+            ['SKIP', cfr_reason],
         'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason],
         'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason],
         'cp_files/cp_files_002_pos': ['SKIP', cfr_reason],
         'fault/auto_online_002_pos': ['FAIL', 11889],
         'fault/auto_replace_001_pos': ['FAIL', 14851],
         'fault/auto_spare_002_pos': ['FAIL', 11889],
         'fault/auto_spare_multiple': ['FAIL', 11889],
         'fault/auto_spare_shared': ['FAIL', 11889],
         'fault/decompress_fault': ['FAIL', 11889],
         'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason],
         'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason],
         'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason],
         'idmap_mount/idmap_mount_004': ['SKIP', idmap_reason],
         'idmap_mount/idmap_mount_005': ['SKIP', idmap_reason],
         'io/io_uring': ['SKIP', 'io_uring support required'],
         'limits/filesystem_limit': ['SKIP', known_reason],
         'limits/snapshot_limit': ['SKIP', known_reason],
         'mmp/mmp_active_import': ['FAIL', known_reason],
         'mmp/mmp_exported_import': ['FAIL', known_reason],
         'mmp/mmp_inactive_import': ['FAIL', known_reason],
         'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872],
         'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621],
         'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872],
         'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason],
     })
 
 # Not all GitHub Actions runners have the scsi_debug module, so we may skip
 # some tests which use it.
 if os.environ.get('CI') == 'true':
     known.update({
         'cli_root/zpool_expand/zpool_expand_001_pos': ['SKIP', ci_reason],
         'cli_root/zpool_expand/zpool_expand_003_neg': ['SKIP', ci_reason],
         'cli_root/zpool_expand/zpool_expand_005_pos': ['SKIP', ci_reason],
         'cli_root/zpool_reopen/setup': ['SKIP', ci_reason],
         'cli_root/zpool_reopen/zpool_reopen_001_pos': ['SKIP', ci_reason],
         'cli_root/zpool_reopen/zpool_reopen_002_pos': ['SKIP', ci_reason],
         'cli_root/zpool_reopen/zpool_reopen_003_pos': ['SKIP', ci_reason],
         'cli_root/zpool_reopen/zpool_reopen_004_pos': ['SKIP', ci_reason],
         'cli_root/zpool_reopen/zpool_reopen_005_pos': ['SKIP', ci_reason],
         'cli_root/zpool_reopen/zpool_reopen_006_neg': ['SKIP', ci_reason],
         'cli_root/zpool_reopen/zpool_reopen_007_pos': ['SKIP', ci_reason],
         'cli_root/zpool_split/zpool_split_wholedisk': ['SKIP', ci_reason],
         'fault/auto_offline_001_pos': ['SKIP', ci_reason],
         'fault/auto_online_001_pos': ['SKIP', ci_reason],
         'fault/auto_online_002_pos': ['SKIP', ci_reason],
         'fault/auto_replace_001_pos': ['SKIP', ci_reason],
         'fault/auto_replace_002_pos': ['SKIP', ci_reason],
         'fault/auto_spare_ashift': ['SKIP', ci_reason],
         'fault/auto_spare_shared': ['SKIP', ci_reason],
         'fault/suspend_resume_single': ['SKIP', ci_reason],
         'procfs/pool_state': ['SKIP', ci_reason],
     })
 
     maybe.update({
         'events/events_002_pos': ['FAIL', 11546],
     })
 
 
 def process_results(pathname):
     """Parse the stdout log of a zfstest run into per-test results.
 
     A result line starts with "Test" (optionally followed by a
     parenthesized tag) and a colon, then a path containing
     "/zfs-tests/tests/functional/" or "/zfs-tests/tests/perf/regression/",
     then "(run as <user>) [<time>] [<result>]".  Each such line adds one
     entry keyed by the part of the path that follows that directory.
 
     Returns a (results, logdir) tuple.  "results" maps test name to its
     result string; a later line for the same test overwrites an earlier
     one.  "logdir" is the value of the last "Log directory:" line, or a
     placeholder message when the log contains none.
 
     If the file cannot be opened an error is printed and the script exits
     with status 1.
     """
     try:
         f = open(pathname)
     except IOError as e:
         print('Error opening file:', e)
         sys.exit(1)
 
     prefix = '/zfs-tests/tests/(?:functional|perf/regression)/'
     result_re = re.compile(
         r'^Test(?:\s+\(\S+\))?:' +
         rf'\s*\S*{prefix}(\S+)' +
         r'\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]')
     logdir_re = re.compile(r'^\s*Log directory:\s*(\S*)')
 
     d = {}
     logdir = 'Could not determine log directory.'
 
     # The context manager guarantees the log is closed, even if parsing
     # raises; iterating the file avoids loading the whole log into memory.
     with f:
         for line in f:
             m = result_re.match(line)
             if m:
                 # Group 1 is the test name, group 4 the result.
                 d[m.group(1)] = m.group(4)
                 continue
 
             m = logdir_re.match(line)
             if m:
                 logdir = m.group(1)
 
     return d, logdir
 
 
 class ListMaybesAction(argparse.Action):
     """argparse action which prints every key of ``maybe`` and exits 0."""
 
     def __init__(self,
                  option_strings,
                  dest="SUPPRESS",
                  default="SUPPRESS",
                  help="list flaky tests and exit"):
         # NOTE(review): both defaults are the literal string "SUPPRESS",
         # not argparse.SUPPRESS -- presumably the constant was intended;
         # confirm before changing.
         # nargs=0 makes this a bare flag which consumes no arguments.
         settings = dict(
             option_strings=option_strings,
             dest=dest,
             default=default,
             nargs=0,
             help=help,
         )
         super().__init__(**settings)
 
     def __call__(self, parser, namespace, values, option_string=None):
         # One test name per line, then stop before the logfile is analyzed.
         for name in maybe.keys():
             print(name)
         sys.exit(0)
 
 
 def setup_path(test):
     """Return the name of the 'setup' script of the group containing test.
 
     Only the final path component is swapped for 'setup', so a test named
     after its own directory (e.g. 'zvol/zvol_stress/zvol_stress') still
     maps to that directory's setup ('zvol/zvol_stress/setup').
     """
     head, sep, _ = test.rpartition('/')
     return head + sep + "setup"
 
 
 if __name__ == "__main__":
     # Usage: zts-report [--list-maybes] [--no-maybes] logfile
     #
     # Exit status: 0 when there are no unexpected results (or no results
     # at all), 2 when --no-maybes is given and every unexpected result is
     # listed in 'maybe', 1 otherwise.
     parser = argparse.ArgumentParser(description='Analyze ZTS logs')
     parser.add_argument('logfile')
     parser.add_argument('--list-maybes', action=ListMaybesAction)
     parser.add_argument('--no-maybes', action='store_false', dest='maybes')
     args = parser.parse_args()
 
     results, logdir = process_results(args.logfile)
 
     if not results:
         print("\n\nNo test results were found.")
         print("Log directory:", logdir)
         sys.exit(0)
 
     expected = []
     unexpected = []
     all_maybes = True
 
     for test in list(results.keys()):
         if results[test] == "PASS":
             continue
 
         # A skipped test is ignored entirely when its group's setup script
         # is itself expected to be skipped.
         setup = setup_path(test)
         if results[test] == "SKIP" and test != setup:
             if setup in known and known[setup][0] == "SKIP":
                 continue
             if setup in maybe and maybe[setup][0] == "SKIP":
                 continue
 
         if test in known and results[test] == known[test][0]:
             expected.append(test)
         elif test in maybe and results[test] == maybe[test][0]:
             # With --no-maybes only a SKIP is still tolerated for a
             # 'maybe' test; any other result is reported as unexpected.
             if results[test] == 'SKIP' or args.maybes:
                 expected.append(test)
             else:
                 unexpected.append(test)
         else:
             unexpected.append(test)
             all_maybes = False
 
     issue_url = 'https://github.com/openzfs/zfs/issues/'
 
     print("\nTests with results other than PASS that are expected:")
     for test in sorted(expected):
         setup = setup_path(test)
 
         # Include the reason why the result is expected, given the following:
         # 1. Suppress test results which set the "Not applicable" reason.
         # 2. Numerical reasons are assumed to be GitHub issue numbers.
         # 3. When an entire test group is skipped only report the setup reason.
         #
         # A test may be listed in both tables with different expected
         # results, so take the reason from the table which actually matched.
         if test in known and results[test] == known[test][0]:
             if known[test][1] == na_reason:
                 continue
             reason = known[test][1]
         elif test in maybe:
             reason = maybe[test][1]
         elif setup in known and known[setup][0] == "SKIP" and setup != test:
             continue
         elif setup in maybe and maybe[setup][0] == "SKIP" and setup != test:
             continue
         else:
             reason = "UNKNOWN REASON"
 
         if isinstance(reason, int):
             expect = f"{issue_url}{reason}"
         else:
             expect = reason
         print(f"    {results[test]} {test} ({expect})")
 
     print("\nTests with result of PASS that are unexpected:")
     for test in sorted(known.keys()):
         # We probably should not be silently ignoring the case
         # where "test" is not in "results".
         if test not in results or results[test] != "PASS":
             continue
         print(f"    {results[test]} {test} (expected {known[test][0]})")
 
     print("\nTests with results other than PASS that are unexpected:")
     for test in sorted(unexpected):
         expect = "PASS" if test not in known else known[test][0]
         print(f"    {results[test]} {test} (expected {expect})")
 
     if len(unexpected) == 0:
         sys.exit(0)
     elif not args.maybes and all_maybes:
         sys.exit(2)
     else:
         sys.exit(1)
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
index b8eed952b90e..a896a21093ca 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am
@@ -1,2111 +1,2112 @@
 CLEANFILES =
 dist_noinst_DATA =
 include $(top_srcdir)/config/Substfiles.am
 
 
 datadir_zfs_tests_testsdir = $(datadir)/$(PACKAGE)/zfs-tests/tests
 nobase_dist_datadir_zfs_tests_tests_DATA = \
 	perf/nfs-sample.cfg \
 	perf/perf.shlib \
 	\
 	perf/fio/mkfiles.fio \
 	perf/fio/random_reads.fio \
 	perf/fio/random_readwrite.fio \
 	perf/fio/random_readwrite_fixed.fio \
 	perf/fio/random_writes.fio \
 	perf/fio/sequential_reads.fio \
 	perf/fio/sequential_readwrite.fio \
 	perf/fio/sequential_writes.fio
 
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \
 	perf/regression/random_reads.ksh \
 	perf/regression/random_readwrite.ksh \
 	perf/regression/random_readwrite_fixed.ksh \
 	perf/regression/random_writes.ksh \
 	perf/regression/random_writes_zil.ksh \
 	perf/regression/sequential_reads_arc_cached_clone.ksh \
 	perf/regression/sequential_reads_arc_cached.ksh \
 	perf/regression/sequential_reads_dbuf_cached.ksh \
 	perf/regression/sequential_reads.ksh \
 	perf/regression/sequential_writes.ksh \
 	perf/regression/setup.ksh \
 	\
 	perf/scripts/prefetch_io.sh
 
 # These lists can be regenerated by running make regen-tests at the root, or, on a *clean* source:
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable   -name '*.in'                                              | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'   -executable   -name '*.in'                                              | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'               ! -name '*.in' ! -name '*.c'  | grep  -Fe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #   find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'   -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$!s/$/ \\/'
 #
 # simd and tmpfile are Linux-only and not installed elsewhere
 #
 # C programs are specced in ../Makefile.am above as part of the main Makefile
 
 find_common := find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po'
 regen:
 	@$(MAKE) -C $(top_builddir) clean
 	@$(MAKE) clean
 	$(SED) $(ac_inplace) '/^# -- >8 --/q' Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_nodist_datadir_zfs_tests_tests_DATA = \' >> Makefile.am
 	$(find_common) ! -executable   -name '*.in'                                              | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo 'nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \' >> Makefile.am
 	$(find_common)   -executable   -name '*.in'                                              | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo >> Makefile.am
 	echo 'SUBSTFILES += $$(nobase_nodist_datadir_zfs_tests_tests_DATA) $$(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)' >> Makefile.am
 	echo >> Makefile.am
 	echo 'if BUILD_LINUX' >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am
 	$(find_common)               ! -name '*.in' ! -name '*.c'  | grep  -Fe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo 'endif' >> Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_DATA += \' >> Makefile.am
 	$(find_common) ! -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 	echo >> Makefile.am
 	echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am
 	$(find_common)   -executable ! -name '*.in' ! -name '*.c'  | grep -vFe /simd -e /tmpfile | sort | sed           's/^/\t/;$$!s/$$/ \\/' >> Makefile.am
 
 # -- >8 --
 
 nobase_nodist_datadir_zfs_tests_tests_DATA = \
 	functional/pam/utilities.kshlib
 nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \
 	functional/pyzfs/pyzfs_unittest.ksh
 
 SUBSTFILES += $(nobase_nodist_datadir_zfs_tests_tests_DATA) $(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)
 
 if BUILD_LINUX
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/simd/simd_supported.ksh \
 	functional/tmpfile/cleanup.ksh \
 	functional/tmpfile/setup.ksh
 endif
 
 nobase_dist_datadir_zfs_tests_tests_DATA += \
 	functional/acl/acl.cfg \
 	functional/acl/acl_common.kshlib \
 	functional/alloc_class/alloc_class.cfg \
 	functional/alloc_class/alloc_class.kshlib \
 	functional/atime/atime.cfg \
 	functional/atime/atime_common.kshlib \
 	functional/bclone/bclone.cfg \
 	functional/bclone/bclone_common.kshlib \
 	functional/bclone/bclone_corner_cases.kshlib \
 	functional/block_cloning/block_cloning.kshlib \
 	functional/cache/cache.cfg \
 	functional/cache/cache.kshlib \
 	functional/cachefile/cachefile.cfg \
 	functional/cachefile/cachefile.kshlib \
 	functional/casenorm/casenorm.cfg \
 	functional/casenorm/casenorm.kshlib \
 	functional/channel_program/channel_common.kshlib \
 	functional/channel_program/lua_core/tst.args_to_lua.out \
 	functional/channel_program/lua_core/tst.args_to_lua.zcp \
 	functional/channel_program/lua_core/tst.divide_by_zero.err \
 	functional/channel_program/lua_core/tst.divide_by_zero.zcp \
 	functional/channel_program/lua_core/tst.exists.zcp \
 	functional/channel_program/lua_core/tst.large_prog.out \
 	functional/channel_program/lua_core/tst.large_prog.zcp \
 	functional/channel_program/lua_core/tst.lib_base.lua \
 	functional/channel_program/lua_core/tst.lib_coroutine.lua \
 	functional/channel_program/lua_core/tst.lib_strings.lua \
 	functional/channel_program/lua_core/tst.lib_table.lua \
 	functional/channel_program/lua_core/tst.nested_neg.zcp \
 	functional/channel_program/lua_core/tst.nested_pos.zcp \
 	functional/channel_program/lua_core/tst.recursive.zcp \
 	functional/channel_program/lua_core/tst.return_large.zcp \
 	functional/channel_program/lua_core/tst.return_recursive_table.zcp \
 	functional/channel_program/lua_core/tst.stack_gsub.err \
 	functional/channel_program/lua_core/tst.stack_gsub.zcp \
 	functional/channel_program/lua_core/tst.timeout.zcp \
 	functional/channel_program/synctask_core/tst.bookmark.copy.zcp \
 	functional/channel_program/synctask_core/tst.bookmark.create.zcp \
 	functional/channel_program/synctask_core/tst.get_index_props.out \
 	functional/channel_program/synctask_core/tst.get_index_props.zcp \
 	functional/channel_program/synctask_core/tst.get_number_props.out \
 	functional/channel_program/synctask_core/tst.get_number_props.zcp \
 	functional/channel_program/synctask_core/tst.get_string_props.out \
 	functional/channel_program/synctask_core/tst.get_string_props.zcp \
 	functional/channel_program/synctask_core/tst.promote_conflict.zcp \
 	functional/channel_program/synctask_core/tst.set_props.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_neg.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_rename.zcp \
 	functional/channel_program/synctask_core/tst.snapshot_simple.zcp \
 	functional/checksum/default.cfg \
 	functional/clean_mirror/clean_mirror_common.kshlib \
 	functional/clean_mirror/default.cfg \
 	functional/cli_root/cli_common.kshlib \
 	functional/cli_root/zfs_copies/zfs_copies.cfg \
 	functional/cli_root/zfs_copies/zfs_copies.kshlib \
 	functional/cli_root/zfs_create/properties.kshlib \
 	functional/cli_root/zfs_create/zfs_create.cfg \
 	functional/cli_root/zfs_create/zfs_create_common.kshlib \
 	functional/cli_root/zfs_destroy/zfs_destroy.cfg \
 	functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib \
 	functional/cli_root/zfs_get/zfs_get_common.kshlib \
 	functional/cli_root/zfs_get/zfs_get_list_d.kshlib \
 	functional/cli_root/zfs_jail/jail.conf \
 	functional/cli_root/zfs_load-key/HEXKEY \
 	functional/cli_root/zfs_load-key/PASSPHRASE \
 	functional/cli_root/zfs_load-key/RAWKEY \
 	functional/cli_root/zfs_load-key/zfs_load-key.cfg \
 	functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib \
 	functional/cli_root/zfs_mount/zfs_mount.cfg \
 	functional/cli_root/zfs_mount/zfs_mount.kshlib \
 	functional/cli_root/zfs_promote/zfs_promote.cfg \
 	functional/cli_root/zfs_receive/zstd_test_data.txt \
 	functional/cli_root/zfs_rename/zfs_rename.cfg \
 	functional/cli_root/zfs_rename/zfs_rename.kshlib \
 	functional/cli_root/zfs_rollback/zfs_rollback.cfg \
 	functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib \
 	functional/cli_root/zfs_send/zfs_send.cfg \
 	functional/cli_root/zfs_set/zfs_set_common.kshlib \
 	functional/cli_root/zfs_share/zfs_share.cfg \
 	functional/cli_root/zfs_snapshot/zfs_snapshot.cfg \
 	functional/cli_root/zfs_unmount/zfs_unmount.cfg \
 	functional/cli_root/zfs_unmount/zfs_unmount.kshlib \
 	functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib \
 	functional/cli_root/zfs_wait/zfs_wait.kshlib \
 	functional/cli_root/zpool_add/zpool_add.cfg \
 	functional/cli_root/zpool_add/zpool_add.kshlib \
 	functional/cli_root/zpool_clear/zpool_clear.cfg \
 	functional/cli_root/zpool_create/draidcfg.gz \
 	functional/cli_root/zpool_create/zpool_create.cfg \
 	functional/cli_root/zpool_create/zpool_create.shlib \
 	functional/cli_root/zpool_destroy/zpool_destroy.cfg \
 	functional/cli_root/zpool_events/zpool_events.cfg \
 	functional/cli_root/zpool_events/zpool_events.kshlib \
 	functional/cli_root/zpool_expand/zpool_expand.cfg \
 	functional/cli_root/zpool_export/zpool_export.cfg \
 	functional/cli_root/zpool_export/zpool_export.kshlib \
 	functional/cli_root/zpool_get/vdev_get.cfg \
 	functional/cli_root/zpool_get/zpool_get.cfg \
 	functional/cli_root/zpool_get/zpool_get_parsable.cfg \
 	functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \
 	functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 \
 	functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \
 	functional/cli_root/zpool_import/zpool_import.cfg \
 	functional/cli_root/zpool_import/zpool_import.kshlib \
 	functional/cli_root/zpool_initialize/zpool_initialize.kshlib \
 	functional/cli_root/zpool_labelclear/labelclear.cfg \
 	functional/cli_root/zpool_remove/zpool_remove.cfg \
 	functional/cli_root/zpool_reopen/zpool_reopen.cfg \
 	functional/cli_root/zpool_reopen/zpool_reopen.shlib \
 	functional/cli_root/zpool_resilver/zpool_resilver.cfg \
 	functional/cli_root/zpool_scrub/zpool_scrub.cfg \
 	functional/cli_root/zpool_split/zpool_split.cfg \
 	functional/cli_root/zpool_trim/zpool_trim.kshlib \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 \
 	functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 \
 	functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \
 	functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \
 	functional/cli_root/zpool_wait/zpool_wait.kshlib \
 	functional/cli_root/zhack/library.kshlib \
 	functional/cli_user/misc/misc.cfg \
 	functional/cli_user/zfs_list/zfs_list.cfg \
 	functional/cli_user/zfs_list/zfs_list.kshlib \
 	functional/compression/compress.cfg \
 	functional/compression/testpool_zstd.tar.gz \
 	functional/deadman/deadman.cfg \
 	functional/delegate/delegate.cfg \
 	functional/delegate/delegate_common.kshlib \
 	functional/devices/devices.cfg \
 	functional/devices/devices_common.kshlib \
 	functional/events/events.cfg \
 	functional/events/events_common.kshlib \
 	functional/fault/fault.cfg \
 	functional/grow/grow.cfg \
 	functional/history/history.cfg \
 	functional/history/history_common.kshlib \
 	functional/history/i386.migratedpool.DAT.Z \
 	functional/history/i386.orig_history.txt \
 	functional/history/sparc.migratedpool.DAT.Z \
 	functional/history/sparc.orig_history.txt \
 	functional/history/zfs-pool-v4.dat.Z \
 	functional/inheritance/config001.cfg \
 	functional/inheritance/config002.cfg \
 	functional/inheritance/config003.cfg \
 	functional/inheritance/config004.cfg \
 	functional/inheritance/config005.cfg \
 	functional/inheritance/config006.cfg \
 	functional/inheritance/config007.cfg \
 	functional/inheritance/config008.cfg \
 	functional/inheritance/config009.cfg \
 	functional/inheritance/config010.cfg \
 	functional/inheritance/config011.cfg \
 	functional/inheritance/config012.cfg \
 	functional/inheritance/config013.cfg \
 	functional/inheritance/config014.cfg \
 	functional/inheritance/config015.cfg \
 	functional/inheritance/config016.cfg \
 	functional/inheritance/config017.cfg \
 	functional/inheritance/config018.cfg \
 	functional/inheritance/config019.cfg \
 	functional/inheritance/config020.cfg \
 	functional/inheritance/config021.cfg \
 	functional/inheritance/config022.cfg \
 	functional/inheritance/config023.cfg \
 	functional/inheritance/config024.cfg \
 	functional/inheritance/inherit.kshlib \
 	functional/inheritance/README.config \
 	functional/inheritance/README.state \
 	functional/inheritance/state001.cfg \
 	functional/inheritance/state002.cfg \
 	functional/inheritance/state003.cfg \
 	functional/inheritance/state004.cfg \
 	functional/inheritance/state005.cfg \
 	functional/inheritance/state006.cfg \
 	functional/inheritance/state007.cfg \
 	functional/inheritance/state008.cfg \
 	functional/inheritance/state009.cfg \
 	functional/inheritance/state010.cfg \
 	functional/inheritance/state011.cfg \
 	functional/inheritance/state012.cfg \
 	functional/inheritance/state013.cfg \
 	functional/inheritance/state014.cfg \
 	functional/inheritance/state015.cfg \
 	functional/inheritance/state016.cfg \
 	functional/inheritance/state017.cfg \
 	functional/inheritance/state018.cfg \
 	functional/inheritance/state019.cfg \
 	functional/inheritance/state020.cfg \
 	functional/inheritance/state021.cfg \
 	functional/inheritance/state022.cfg \
 	functional/inheritance/state023.cfg \
 	functional/inheritance/state024.cfg \
 	functional/inuse/inuse.cfg \
 	functional/io/io.cfg \
 	functional/l2arc/l2arc.cfg \
 	functional/largest_pool/largest_pool.cfg \
 	functional/migration/migration.cfg \
 	functional/migration/migration.kshlib \
 	functional/mmap/mmap.cfg \
 	functional/mmp/mmp.cfg \
 	functional/mmp/mmp.kshlib \
 	functional/mv_files/mv_files.cfg \
 	functional/mv_files/mv_files_common.kshlib \
 	functional/nopwrite/nopwrite.shlib \
 	functional/no_space/enospc.cfg \
 	functional/online_offline/online_offline.cfg \
 	functional/pool_checkpoint/pool_checkpoint.kshlib \
 	functional/projectquota/projectquota.cfg \
 	functional/projectquota/projectquota_common.kshlib \
 	functional/quota/quota.cfg \
 	functional/quota/quota.kshlib \
 	functional/redacted_send/redacted.cfg \
 	functional/redacted_send/redacted.kshlib \
 	functional/redundancy/redundancy.cfg \
 	functional/redundancy/redundancy.kshlib \
 	functional/refreserv/refreserv.cfg \
 	functional/removal/removal.kshlib \
 	functional/replacement/replacement.cfg \
 	functional/reservation/reservation.cfg \
 	functional/reservation/reservation.shlib \
 	functional/rsend/dedup_encrypted_zvol.bz2 \
 	functional/rsend/dedup_encrypted_zvol.zsend.bz2 \
 	functional/rsend/dedup.zsend.bz2 \
 	functional/rsend/fs.tar.gz \
 	functional/rsend/rsend.cfg \
 	functional/rsend/rsend.kshlib \
 	functional/scrub_mirror/default.cfg \
 	functional/scrub_mirror/scrub_mirror_common.kshlib \
 	functional/slog/slog.cfg \
 	functional/slog/slog.kshlib \
 	functional/snapshot/snapshot.cfg \
 	functional/snapused/snapused.kshlib \
 	functional/sparse/sparse.cfg \
 	functional/trim/trim.cfg \
 	functional/trim/trim.kshlib \
 	functional/truncate/truncate.cfg \
 	functional/upgrade/upgrade_common.kshlib \
 	functional/user_namespace/user_namespace.cfg \
 	functional/user_namespace/user_namespace_common.kshlib \
 	functional/userquota/13709_reproducer.bz2 \
 	functional/userquota/userquota.cfg \
 	functional/userquota/userquota_common.kshlib \
 	functional/vdev_zaps/vdev_zaps.kshlib \
 	functional/xattr/xattr.cfg \
 	functional/xattr/xattr_common.kshlib \
 	functional/zvol/zvol.cfg \
 	functional/zvol/zvol_cli/zvol_cli.cfg \
 	functional/zvol/zvol_common.shlib \
 	functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg \
 	functional/zvol/zvol_misc/zvol_misc_common.kshlib \
 	functional/zvol/zvol_swap/zvol_swap.cfg \
 	functional/idmap_mount/idmap_mount.cfg \
 	functional/idmap_mount/idmap_mount_common.kshlib
 
 nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
 	functional/acl/off/cleanup.ksh \
 	functional/acl/off/dosmode.ksh \
 	functional/acl/off/posixmode.ksh \
 	functional/acl/off/setup.ksh \
 	functional/acl/posix/cleanup.ksh \
 	functional/acl/posix/posix_001_pos.ksh \
 	functional/acl/posix/posix_002_pos.ksh \
 	functional/acl/posix/posix_003_pos.ksh \
 	functional/acl/posix/posix_004_pos.ksh \
 	functional/acl/posix-sa/cleanup.ksh \
 	functional/acl/posix-sa/posix_001_pos.ksh \
 	functional/acl/posix-sa/posix_002_pos.ksh \
 	functional/acl/posix-sa/posix_003_pos.ksh \
 	functional/acl/posix-sa/posix_004_pos.ksh \
 	functional/acl/posix-sa/setup.ksh \
 	functional/acl/posix/setup.ksh \
 	functional/alloc_class/alloc_class_001_pos.ksh \
 	functional/alloc_class/alloc_class_002_neg.ksh \
 	functional/alloc_class/alloc_class_003_pos.ksh \
 	functional/alloc_class/alloc_class_004_pos.ksh \
 	functional/alloc_class/alloc_class_005_pos.ksh \
 	functional/alloc_class/alloc_class_006_pos.ksh \
 	functional/alloc_class/alloc_class_007_pos.ksh \
 	functional/alloc_class/alloc_class_008_pos.ksh \
 	functional/alloc_class/alloc_class_009_pos.ksh \
 	functional/alloc_class/alloc_class_010_pos.ksh \
 	functional/alloc_class/alloc_class_011_neg.ksh \
 	functional/alloc_class/alloc_class_012_pos.ksh \
 	functional/alloc_class/alloc_class_013_pos.ksh \
 	functional/alloc_class/alloc_class_014_neg.ksh \
 	functional/alloc_class/alloc_class_015_pos.ksh \
 	functional/alloc_class/cleanup.ksh \
 	functional/alloc_class/setup.ksh \
 	functional/append/file_append.ksh \
 	functional/append/threadsappend_001_pos.ksh \
 	functional/append/cleanup.ksh \
 	functional/append/setup.ksh \
 	functional/arc/arcstats_runtime_tuning.ksh \
 	functional/arc/cleanup.ksh \
 	functional/arc/dbufstats_001_pos.ksh \
 	functional/arc/dbufstats_002_pos.ksh \
 	functional/arc/dbufstats_003_pos.ksh \
 	functional/arc/setup.ksh \
 	functional/atime/atime_001_pos.ksh \
 	functional/atime/atime_002_neg.ksh \
 	functional/atime/atime_003_pos.ksh \
 	functional/atime/cleanup.ksh \
 	functional/atime/root_atime_off.ksh \
 	functional/atime/root_atime_on.ksh \
 	functional/atime/root_relatime_on.ksh \
 	functional/atime/setup.ksh \
 	functional/bclone/bclone_crossfs_corner_cases.ksh \
 	functional/bclone/bclone_crossfs_corner_cases_limited.ksh \
 	functional/bclone/bclone_crossfs_data.ksh \
 	functional/bclone/bclone_crossfs_embedded.ksh \
 	functional/bclone/bclone_crossfs_hole.ksh \
 	functional/bclone/bclone_diffprops_all.ksh \
 	functional/bclone/bclone_diffprops_checksum.ksh \
 	functional/bclone/bclone_diffprops_compress.ksh \
 	functional/bclone/bclone_diffprops_copies.ksh \
 	functional/bclone/bclone_diffprops_recordsize.ksh \
 	functional/bclone/bclone_prop_sync.ksh \
 	functional/bclone/bclone_samefs_corner_cases.ksh \
 	functional/bclone/bclone_samefs_corner_cases_limited.ksh \
 	functional/bclone/bclone_samefs_data.ksh \
 	functional/bclone/bclone_samefs_embedded.ksh \
 	functional/bclone/bclone_samefs_hole.ksh \
 	functional/bclone/cleanup.ksh \
 	functional/bclone/setup.ksh \
 	functional/block_cloning/cleanup.ksh \
 	functional/block_cloning/setup.ksh \
 	functional/block_cloning/block_cloning_clone_mmap_cached.ksh \
 	functional/block_cloning/block_cloning_clone_mmap_write.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \
 	functional/block_cloning/block_cloning_copyfilerange.ksh \
 	functional/block_cloning/block_cloning_copyfilerange_partial.ksh \
 	functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \
 	functional/block_cloning/block_cloning_disabled_ficlone.ksh \
 	functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \
 	functional/block_cloning/block_cloning_ficlone.ksh \
 	functional/block_cloning/block_cloning_ficlonerange.ksh \
 	functional/block_cloning/block_cloning_ficlonerange_partial.ksh \
 	functional/block_cloning/block_cloning_cross_enc_dataset.ksh \
 	functional/block_cloning/block_cloning_replay.ksh \
 	functional/block_cloning/block_cloning_replay_encrypted.ksh \
 	functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \
+	functional/block_cloning/block_cloning_rlimit_fsize.ksh \
 	functional/bootfs/bootfs_001_pos.ksh \
 	functional/bootfs/bootfs_002_neg.ksh \
 	functional/bootfs/bootfs_003_pos.ksh \
 	functional/bootfs/bootfs_004_neg.ksh \
 	functional/bootfs/bootfs_005_neg.ksh \
 	functional/bootfs/bootfs_006_pos.ksh \
 	functional/bootfs/bootfs_007_pos.ksh \
 	functional/bootfs/bootfs_008_pos.ksh \
 	functional/bootfs/cleanup.ksh \
 	functional/bootfs/setup.ksh \
 	functional/btree/btree_negative.ksh \
 	functional/btree/btree_positive.ksh \
 	functional/cache/cache_001_pos.ksh \
 	functional/cache/cache_002_pos.ksh \
 	functional/cache/cache_003_pos.ksh \
 	functional/cache/cache_004_neg.ksh \
 	functional/cache/cache_005_neg.ksh \
 	functional/cache/cache_006_pos.ksh \
 	functional/cache/cache_007_neg.ksh \
 	functional/cache/cache_008_neg.ksh \
 	functional/cache/cache_009_pos.ksh \
 	functional/cache/cache_010_pos.ksh \
 	functional/cache/cache_011_pos.ksh \
 	functional/cache/cache_012_pos.ksh \
 	functional/cache/cleanup.ksh \
 	functional/cachefile/cachefile_001_pos.ksh \
 	functional/cachefile/cachefile_002_pos.ksh \
 	functional/cachefile/cachefile_003_pos.ksh \
 	functional/cachefile/cachefile_004_pos.ksh \
 	functional/cachefile/cleanup.ksh \
 	functional/cachefile/setup.ksh \
 	functional/cache/setup.ksh \
 	functional/casenorm/case_all_values.ksh \
 	functional/casenorm/cleanup.ksh \
 	functional/casenorm/insensitive_formd_delete.ksh \
 	functional/casenorm/insensitive_formd_lookup.ksh \
 	functional/casenorm/insensitive_none_delete.ksh \
 	functional/casenorm/insensitive_none_lookup.ksh \
 	functional/casenorm/mixed_create_failure.ksh \
 	functional/casenorm/mixed_formd_delete.ksh \
 	functional/casenorm/mixed_formd_lookup_ci.ksh \
 	functional/casenorm/mixed_formd_lookup.ksh \
 	functional/casenorm/mixed_none_delete.ksh \
 	functional/casenorm/mixed_none_lookup_ci.ksh \
 	functional/casenorm/mixed_none_lookup.ksh \
 	functional/casenorm/norm_all_values.ksh \
 	functional/casenorm/sensitive_formd_delete.ksh \
 	functional/casenorm/sensitive_formd_lookup.ksh \
 	functional/casenorm/sensitive_none_delete.ksh \
 	functional/casenorm/sensitive_none_lookup.ksh \
 	functional/casenorm/setup.ksh \
 	functional/channel_program/lua_core/cleanup.ksh \
 	functional/channel_program/lua_core/setup.ksh \
 	functional/channel_program/lua_core/tst.args_to_lua.ksh \
 	functional/channel_program/lua_core/tst.divide_by_zero.ksh \
 	functional/channel_program/lua_core/tst.exists.ksh \
 	functional/channel_program/lua_core/tst.integer_illegal.ksh \
 	functional/channel_program/lua_core/tst.integer_overflow.ksh \
 	functional/channel_program/lua_core/tst.language_functions_neg.ksh \
 	functional/channel_program/lua_core/tst.language_functions_pos.ksh \
 	functional/channel_program/lua_core/tst.large_prog.ksh \
 	functional/channel_program/lua_core/tst.libraries.ksh \
 	functional/channel_program/lua_core/tst.memory_limit.ksh \
 	functional/channel_program/lua_core/tst.nested_neg.ksh \
 	functional/channel_program/lua_core/tst.nested_pos.ksh \
 	functional/channel_program/lua_core/tst.nvlist_to_lua.ksh \
 	functional/channel_program/lua_core/tst.recursive_neg.ksh \
 	functional/channel_program/lua_core/tst.recursive_pos.ksh \
 	functional/channel_program/lua_core/tst.return_large.ksh \
 	functional/channel_program/lua_core/tst.return_nvlist_neg.ksh \
 	functional/channel_program/lua_core/tst.return_nvlist_pos.ksh \
 	functional/channel_program/lua_core/tst.return_recursive_table.ksh \
 	functional/channel_program/lua_core/tst.stack_gsub.ksh \
 	functional/channel_program/lua_core/tst.timeout.ksh \
 	functional/channel_program/synctask_core/cleanup.ksh \
 	functional/channel_program/synctask_core/setup.ksh \
 	functional/channel_program/synctask_core/tst.bookmark.copy.ksh \
 	functional/channel_program/synctask_core/tst.bookmark.create.ksh \
 	functional/channel_program/synctask_core/tst.destroy_fs.ksh \
 	functional/channel_program/synctask_core/tst.destroy_snap.ksh \
 	functional/channel_program/synctask_core/tst.get_count_and_limit.ksh \
 	functional/channel_program/synctask_core/tst.get_index_props.ksh \
 	functional/channel_program/synctask_core/tst.get_mountpoint.ksh \
 	functional/channel_program/synctask_core/tst.get_neg.ksh \
 	functional/channel_program/synctask_core/tst.get_number_props.ksh \
 	functional/channel_program/synctask_core/tst.get_string_props.ksh \
 	functional/channel_program/synctask_core/tst.get_type.ksh \
 	functional/channel_program/synctask_core/tst.get_userquota.ksh \
 	functional/channel_program/synctask_core/tst.get_written.ksh \
 	functional/channel_program/synctask_core/tst.inherit.ksh \
 	functional/channel_program/synctask_core/tst.list_bookmarks.ksh \
 	functional/channel_program/synctask_core/tst.list_children.ksh \
 	functional/channel_program/synctask_core/tst.list_clones.ksh \
 	functional/channel_program/synctask_core/tst.list_holds.ksh \
 	functional/channel_program/synctask_core/tst.list_snapshots.ksh \
 	functional/channel_program/synctask_core/tst.list_system_props.ksh \
 	functional/channel_program/synctask_core/tst.list_user_props.ksh \
 	functional/channel_program/synctask_core/tst.parse_args_neg.ksh \
 	functional/channel_program/synctask_core/tst.promote_conflict.ksh \
 	functional/channel_program/synctask_core/tst.promote_multiple.ksh \
 	functional/channel_program/synctask_core/tst.promote_simple.ksh \
 	functional/channel_program/synctask_core/tst.rollback_mult.ksh \
 	functional/channel_program/synctask_core/tst.rollback_one.ksh \
 	functional/channel_program/synctask_core/tst.set_props.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_neg.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_rename.ksh \
 	functional/channel_program/synctask_core/tst.snapshot_simple.ksh \
 	functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \
 	functional/chattr/chattr_001_pos.ksh \
 	functional/chattr/chattr_002_neg.ksh \
 	functional/chattr/cleanup.ksh \
 	functional/chattr/setup.ksh \
 	functional/checksum/cleanup.ksh \
 	functional/checksum/filetest_001_pos.ksh \
 	functional/checksum/filetest_002_pos.ksh \
 	functional/checksum/run_blake3_test.ksh \
 	functional/checksum/run_edonr_test.ksh \
 	functional/checksum/run_sha2_test.ksh \
 	functional/checksum/run_skein_test.ksh \
 	functional/checksum/setup.ksh \
 	functional/clean_mirror/clean_mirror_001_pos.ksh \
 	functional/clean_mirror/clean_mirror_002_pos.ksh \
 	functional/clean_mirror/clean_mirror_003_pos.ksh \
 	functional/clean_mirror/clean_mirror_004_pos.ksh \
 	functional/clean_mirror/cleanup.ksh \
 	functional/clean_mirror/setup.ksh \
 	functional/cli_root/zdb/zdb_002_pos.ksh \
 	functional/cli_root/zdb/zdb_003_pos.ksh \
 	functional/cli_root/zdb/zdb_004_pos.ksh \
 	functional/cli_root/zdb/zdb_005_pos.ksh \
 	functional/cli_root/zdb/zdb_006_pos.ksh \
 	functional/cli_root/zdb/zdb_args_neg.ksh \
 	functional/cli_root/zdb/zdb_args_pos.ksh \
 	functional/cli_root/zdb/zdb_backup.ksh \
 	functional/cli_root/zdb/zdb_block_size_histogram.ksh \
 	functional/cli_root/zdb/zdb_checksum.ksh \
 	functional/cli_root/zdb/zdb_decompress.ksh \
 	functional/cli_root/zdb/zdb_decompress_zstd.ksh \
 	functional/cli_root/zdb/zdb_display_block.ksh \
 	functional/cli_root/zdb/zdb_encrypted.ksh \
 	functional/cli_root/zdb/zdb_label_checksum.ksh \
 	functional/cli_root/zdb/zdb_object_range_neg.ksh \
 	functional/cli_root/zdb/zdb_object_range_pos.ksh \
 	functional/cli_root/zdb/zdb_objset_id.ksh \
 	functional/cli_root/zdb/zdb_recover_2.ksh \
 	functional/cli_root/zdb/zdb_recover.ksh \
 	functional/cli_root/zfs_bookmark/cleanup.ksh \
 	functional/cli_root/zfs_bookmark/setup.ksh \
 	functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \
 	functional/cli_root/zfs_change-key/cleanup.ksh \
 	functional/cli_root/zfs_change-key/setup.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_child.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_format.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_load.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_location.ksh \
 	functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh \
 	functional/cli_root/zfs/cleanup.ksh \
 	functional/cli_root/zfs_clone/cleanup.ksh \
 	functional/cli_root/zfs_clone/setup.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh \
 	functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh \
 	functional/cli_root/zfs_copies/cleanup.ksh \
 	functional/cli_root/zfs_copies/setup.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh \
 	functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh \
 	functional/cli_root/zfs_create/cleanup.ksh \
 	functional/cli_root/zfs_create/setup.ksh \
 	functional/cli_root/zfs_create/zfs_create_001_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_002_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_003_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_004_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_005_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_006_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_007_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_008_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_009_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_010_neg.ksh \
 	functional/cli_root/zfs_create/zfs_create_011_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_012_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_013_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_014_pos.ksh \
 	functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh \
 	functional/cli_root/zfs_create/zfs_create_dryrun.ksh \
 	functional/cli_root/zfs_create/zfs_create_encrypted.ksh \
 	functional/cli_root/zfs_create/zfs_create_nomount.ksh \
 	functional/cli_root/zfs_create/zfs_create_verbose.ksh \
 	functional/cli_root/zfs_destroy/cleanup.ksh \
 	functional/cli_root/zfs_destroy/setup.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh \
 	functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh \
 	functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh \
 	functional/cli_root/zfs_diff/cleanup.ksh \
 	functional/cli_root/zfs_diff/setup.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_changes.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_mangle.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh \
 	functional/cli_root/zfs_diff/zfs_diff_types.ksh \
 	functional/cli_root/zfs_get/cleanup.ksh \
 	functional/cli_root/zfs_get/setup.ksh \
 	functional/cli_root/zfs_get/zfs_get_001_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_002_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_003_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_004_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_005_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_006_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_007_neg.ksh \
 	functional/cli_root/zfs_get/zfs_get_008_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_009_pos.ksh \
 	functional/cli_root/zfs_get/zfs_get_010_neg.ksh \
 	functional/cli_root/zfs_ids_to_path/cleanup.ksh \
 	functional/cli_root/zfs_ids_to_path/setup.ksh \
 	functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh \
 	functional/cli_root/zfs_inherit/cleanup.ksh \
 	functional/cli_root/zfs_inherit/setup.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh \
 	functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh \
 	functional/cli_root/zfs_jail/cleanup.ksh \
 	functional/cli_root/zfs_jail/setup.ksh \
 	functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh \
 	functional/cli_root/zfs_load-key/cleanup.ksh \
 	functional/cli_root/zfs_load-key/setup.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_all.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_file.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_https.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_location.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh \
 	functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh \
 	functional/cli_root/zfs_mount/cleanup.ksh \
 	functional/cli_root/zfs_mount/setup.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_remount.ksh \
 	functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \
 	functional/cli_root/zfs_mount/zfs_multi_mount.ksh \
 	functional/cli_root/zfs_program/cleanup.ksh \
 	functional/cli_root/zfs_program/setup.ksh \
 	functional/cli_root/zfs_program/zfs_program_json.ksh \
 	functional/cli_root/zfs_promote/cleanup.ksh \
 	functional/cli_root/zfs_promote/setup.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh \
 	functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh \
 	functional/cli_root/zfs_property/cleanup.ksh \
 	functional/cli_root/zfs_property/setup.ksh \
 	functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh \
 	functional/cli_root/zfs_receive/cleanup.ksh \
 	functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh \
 	functional/cli_root/zfs_receive/receive-o-x_props_override.ksh \
 	functional/cli_root/zfs_receive/setup.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_-e.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_new_props.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_raw.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \
 	functional/cli_root/zfs_receive/zfs_receive_large_block_corrective.ksh \
 	functional/cli_root/zfs_rename/cleanup.ksh \
 	functional/cli_root/zfs_rename/setup.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh \
 	functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh \
 	functional/cli_root/zfs_reservation/cleanup.ksh \
 	functional/cli_root/zfs_reservation/setup.ksh \
 	functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \
 	functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \
 	functional/cli_root/zfs_rollback/cleanup.ksh \
 	functional/cli_root/zfs_rollback/setup.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh \
 	functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh \
 	functional/cli_root/zfs_send/cleanup.ksh \
 	functional/cli_root/zfs_send/setup.ksh \
 	functional/cli_root/zfs_send/zfs_send_001_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_002_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_003_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_004_neg.ksh \
 	functional/cli_root/zfs_send/zfs_send_005_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_006_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send_007_pos.ksh \
 	functional/cli_root/zfs_send/zfs_send-b.ksh \
 	functional/cli_root/zfs_send/zfs_send_encrypted.ksh \
 	functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh \
 	functional/cli_root/zfs_send/zfs_send_raw.ksh \
 	functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \
 	functional/cli_root/zfs_send/zfs_send_sparse.ksh \
 	functional/cli_root/zfs_set/cache_001_pos.ksh \
 	functional/cli_root/zfs_set/cache_002_neg.ksh \
 	functional/cli_root/zfs_set/canmount_001_pos.ksh \
 	functional/cli_root/zfs_set/canmount_002_pos.ksh \
 	functional/cli_root/zfs_set/canmount_003_pos.ksh \
 	functional/cli_root/zfs_set/canmount_004_pos.ksh \
 	functional/cli_root/zfs_set/checksum_001_pos.ksh \
 	functional/cli_root/zfs_set/cleanup.ksh \
 	functional/cli_root/zfs_set/compression_001_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_001_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_002_pos.ksh \
 	functional/cli_root/zfs_set/mountpoint_003_pos.ksh \
 	functional/cli_root/zfs_set/onoffs_001_pos.ksh \
 	functional/cli_root/zfs_set/property_alias_001_pos.ksh \
 	functional/cli_root/zfs_set/readonly_001_pos.ksh \
 	functional/cli_root/zfs_set/reservation_001_neg.ksh \
 	functional/cli_root/zfs_set/ro_props_001_pos.ksh \
 	functional/cli_root/zfs_set/setup.ksh \
 	functional/cli_root/zfs_set/share_mount_001_neg.ksh \
 	functional/cli_root/zfs_set/snapdir_001_pos.ksh \
 	functional/cli_root/zfs/setup.ksh \
 	functional/cli_root/zfs_set/user_property_001_pos.ksh \
 	functional/cli_root/zfs_set/user_property_002_pos.ksh \
 	functional/cli_root/zfs_set/user_property_003_neg.ksh \
 	functional/cli_root/zfs_set/user_property_004_pos.ksh \
 	functional/cli_root/zfs_set/version_001_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_001_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_002_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_003_neg.ksh \
 	functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \
 	functional/cli_root/zfs_set/zfs_set_keylocation.ksh \
 	functional/cli_root/zfs_set/zfs_set_nomount.ksh \
 	functional/cli_root/zfs_share/cleanup.ksh \
 	functional/cli_root/zfs_share/setup.ksh \
 	functional/cli_root/zfs_share/zfs_share_001_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_002_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_003_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_004_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_005_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_006_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_007_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_008_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_009_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_010_neg.ksh \
 	functional/cli_root/zfs_share/zfs_share_011_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_012_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_013_pos.ksh \
 	functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \
 	functional/cli_root/zfs_share/zfs_share_after_mount.ksh \
 	functional/cli_root/zfs_snapshot/cleanup.ksh \
 	functional/cli_root/zfs_snapshot/setup.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh \
 	functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh \
 	functional/cli_root/zfs_sysfs/cleanup.ksh \
 	functional/cli_root/zfs_sysfs/setup.ksh \
 	functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh \
 	functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh \
 	functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh \
 	functional/cli_root/zfs_unload-key/cleanup.ksh \
 	functional/cli_root/zfs_unload-key/setup.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key.ksh \
 	functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh \
 	functional/cli_root/zfs_unmount/cleanup.ksh \
 	functional/cli_root/zfs_unmount/setup.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh \
 	functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh \
 	functional/cli_root/zfs_unshare/cleanup.ksh \
 	functional/cli_root/zfs_unshare/setup.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh \
 	functional/cli_root/zfs_unshare/zfs_unshare_008_pos.ksh \
 	functional/cli_root/zfs_upgrade/cleanup.ksh \
 	functional/cli_root/zfs_upgrade/setup.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh \
 	functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh \
 	functional/cli_root/zfs_wait/cleanup.ksh \
 	functional/cli_root/zfs_wait/setup.ksh \
 	functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh \
 	functional/cli_root/zfs_wait/zfs_wait_getsubopt.ksh \
 	functional/cli_root/zfs/zfs_001_neg.ksh \
 	functional/cli_root/zfs/zfs_002_pos.ksh \
 	functional/cli_root/zfs/zfs_003_neg.ksh \
 	functional/cli_root/zhack/zhack_label_repair_001.ksh \
 	functional/cli_root/zhack/zhack_label_repair_002.ksh \
 	functional/cli_root/zhack/zhack_label_repair_003.ksh \
 	functional/cli_root/zhack/zhack_label_repair_004.ksh \
 	functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \
 	functional/cli_root/zpool_add/add-o_ashift.ksh \
 	functional/cli_root/zpool_add/add_prop_ashift.ksh \
 	functional/cli_root/zpool_add/cleanup.ksh \
 	functional/cli_root/zpool_add/setup.ksh \
 	functional/cli_root/zpool_add/zpool_add--allow-ashift-mismatch.ksh \
 	functional/cli_root/zpool_add/zpool_add_001_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_002_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_003_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_004_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_005_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_006_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_007_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_008_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_009_neg.ksh \
 	functional/cli_root/zpool_add/zpool_add_010_pos.ksh \
 	functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \
 	functional/cli_root/zpool_attach/attach-o_ashift.ksh \
 	functional/cli_root/zpool_attach/cleanup.ksh \
 	functional/cli_root/zpool_attach/setup.ksh \
 	functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \
 	functional/cli_root/zpool/cleanup.ksh \
 	functional/cli_root/zpool_clear/cleanup.ksh \
 	functional/cli_root/zpool_clear/setup.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh \
 	functional/cli_root/zpool_clear/zpool_clear_readonly.ksh \
 	functional/cli_root/zpool_create/cleanup.ksh \
 	functional/cli_root/zpool_create/create-o_ashift.ksh \
 	functional/cli_root/zpool_create/setup.ksh \
 	functional/cli_root/zpool_create/zpool_create_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_004_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_005_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_006_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_007_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_008_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_009_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_010_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_011_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_012_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_014_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_015_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_016_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_017_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_018_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_019_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_020_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_021_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_022_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_023_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_024_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \
 	functional/cli_root/zpool_create/zpool_create_encrypted.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh \
 	functional/cli_root/zpool_create/zpool_create_tempname.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh \
 	functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh \
 	functional/cli_root/zpool_detach/cleanup.ksh \
 	functional/cli_root/zpool_detach/setup.ksh \
 	functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh \
 	functional/cli_root/zpool_events/cleanup.ksh \
 	functional/cli_root/zpool_events/setup.ksh \
 	functional/cli_root/zpool_events/zpool_events_clear.ksh \
 	functional/cli_root/zpool_events/zpool_events_clear_retained.ksh \
 	functional/cli_root/zpool_events/zpool_events_cliargs.ksh \
 	functional/cli_root/zpool_events/zpool_events_duplicates.ksh \
 	functional/cli_root/zpool_events/zpool_events_errors.ksh \
 	functional/cli_root/zpool_events/zpool_events_follow.ksh \
 	functional/cli_root/zpool_events/zpool_events_poolname.ksh \
 	functional/cli_root/zpool_expand/cleanup.ksh \
 	functional/cli_root/zpool_expand/setup.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh \
 	functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh \
 	functional/cli_root/zpool_export/cleanup.ksh \
 	functional/cli_root/zpool_export/setup.ksh \
 	functional/cli_root/zpool_export/zpool_export_001_pos.ksh \
 	functional/cli_root/zpool_export/zpool_export_002_pos.ksh \
 	functional/cli_root/zpool_export/zpool_export_003_neg.ksh \
 	functional/cli_root/zpool_export/zpool_export_004_pos.ksh \
 	functional/cli_root/zpool_get/cleanup.ksh \
 	functional/cli_root/zpool_get/setup.ksh \
 	functional/cli_root/zpool_get/vdev_get_001_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_001_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_002_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_003_pos.ksh \
 	functional/cli_root/zpool_get/zpool_get_004_neg.ksh \
 	functional/cli_root/zpool_get/zpool_get_005_pos.ksh \
 	functional/cli_root/zpool_history/cleanup.ksh \
 	functional/cli_root/zpool_history/setup.ksh \
 	functional/cli_root/zpool_history/zpool_history_001_neg.ksh \
 	functional/cli_root/zpool_history/zpool_history_002_pos.ksh \
 	functional/cli_root/zpool_import/cleanup.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_added.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_removed.ksh \
 	functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh \
 	functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh \
 	functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh \
 	functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \
 	functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \
 	functional/cli_root/zpool_import/import_devices_missing.ksh \
 	functional/cli_root/zpool_import/import_log_missing.ksh \
 	functional/cli_root/zpool_import/import_paths_changed.ksh \
 	functional/cli_root/zpool_import/import_rewind_config_changed.ksh \
 	functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \
 	functional/cli_root/zpool_import/setup.ksh \
 	functional/cli_root/zpool_import/zpool_import_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_002_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_004_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_005_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_006_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_007_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_008_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_009_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_010_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_011_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_012_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_013_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_014_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_015_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_016_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_017_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_encrypted.ksh \
 	functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh \
 	functional/cli_root/zpool_import/zpool_import_errata3.ksh \
 	functional/cli_root/zpool_import/zpool_import_errata4.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \
 	functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \
 	functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \
 	functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \
 	functional/cli_root/zpool_initialize/cleanup.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \
 	functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh \
 	functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh \
 	functional/cli_root/zpool_offline/cleanup.ksh \
 	functional/cli_root/zpool_offline/setup.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \
 	functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \
 	functional/cli_root/zpool_online/cleanup.ksh \
 	functional/cli_root/zpool_online/setup.ksh \
 	functional/cli_root/zpool_online/zpool_online_001_pos.ksh \
 	functional/cli_root/zpool_online/zpool_online_002_neg.ksh \
 	functional/cli_root/zpool_remove/cleanup.ksh \
 	functional/cli_root/zpool_remove/setup.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh \
 	functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh \
 	functional/cli_root/zpool_reopen/cleanup.ksh \
 	functional/cli_root/zpool_reopen/setup.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh \
 	functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh \
 	functional/cli_root/zpool_replace/cleanup.ksh \
 	functional/cli_root/zpool_replace/replace-o_ashift.ksh \
 	functional/cli_root/zpool_replace/replace_prop_ashift.ksh \
 	functional/cli_root/zpool_replace/setup.ksh \
 	functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh \
 	functional/cli_root/zpool_resilver/cleanup.ksh \
 	functional/cli_root/zpool_resilver/setup.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \
 	functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \
 	functional/cli_root/zpool_scrub/cleanup.ksh \
 	functional/cli_root/zpool_scrub/setup.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \
 	functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \
 	functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \
 	functional/cli_root/zpool_set/cleanup.ksh \
 	functional/cli_root/zpool_set/setup.ksh \
 	functional/cli_root/zpool/setup.ksh \
 	functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
 	functional/cli_root/zpool_set/zpool_set_common.kshlib \
 	functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
 	functional/cli_root/zpool_set/zpool_set_002_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_003_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_ashift.ksh \
 	functional/cli_root/zpool_set/user_property_001_pos.ksh \
 	functional/cli_root/zpool_set/user_property_002_neg.ksh \
 	functional/cli_root/zpool_set/zpool_set_features.ksh \
 	functional/cli_root/zpool_split/cleanup.ksh \
 	functional/cli_root/zpool_split/setup.ksh \
 	functional/cli_root/zpool_split/zpool_split_cliargs.ksh \
 	functional/cli_root/zpool_split/zpool_split_devices.ksh \
 	functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh \
 	functional/cli_root/zpool_split/zpool_split_encryption.ksh \
 	functional/cli_root/zpool_split/zpool_split_indirect.ksh \
 	functional/cli_root/zpool_split/zpool_split_props.ksh \
 	functional/cli_root/zpool_split/zpool_split_resilver.ksh \
 	functional/cli_root/zpool_split/zpool_split_vdevs.ksh \
 	functional/cli_root/zpool_split/zpool_split_wholedisk.ksh \
 	functional/cli_root/zpool_status/cleanup.ksh \
 	functional/cli_root/zpool_status/setup.ksh \
 	functional/cli_root/zpool_status/zpool_status_001_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_002_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_003_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_004_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_005_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_006_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_007_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_008_pos.ksh \
 	functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \
 	functional/cli_root/zpool_sync/cleanup.ksh \
 	functional/cli_root/zpool_sync/setup.ksh \
 	functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \
 	functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh \
 	functional/cli_root/zpool_trim/cleanup.ksh \
 	functional/cli_root/zpool_trim/setup.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_partial.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_rate.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_secure.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_split.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh \
 	functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh \
 	functional/cli_root/zpool_upgrade/cleanup.ksh \
 	functional/cli_root/zpool_upgrade/setup.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh \
 	functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh \
 	functional/cli_root/zpool_wait/cleanup.ksh \
 	functional/cli_root/zpool_wait/scan/cleanup.ksh \
 	functional/cli_root/zpool_wait/scan/setup.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh \
 	functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh \
 	functional/cli_root/zpool_wait/setup.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_discard.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_freeing.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_multiple.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_remove.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh \
 	functional/cli_root/zpool_wait/zpool_wait_usage.ksh \
 	functional/cli_root/zpool/zpool_001_neg.ksh \
 	functional/cli_root/zpool/zpool_002_pos.ksh \
 	functional/cli_root/zpool/zpool_003_pos.ksh \
 	functional/cli_root/zpool/zpool_colors.ksh \
 	functional/cli_user/misc/arcstat_001_pos.ksh \
 	functional/cli_user/misc/arc_summary_001_pos.ksh \
 	functional/cli_user/misc/arc_summary_002_neg.ksh \
 	functional/cli_user/misc/zilstat_001_pos.ksh \
 	functional/cli_user/misc/cleanup.ksh \
 	functional/cli_user/misc/setup.ksh \
 	functional/cli_user/misc/zdb_001_neg.ksh \
 	functional/cli_user/misc/zfs_001_neg.ksh \
 	functional/cli_user/misc/zfs_allow_001_neg.ksh \
 	functional/cli_user/misc/zfs_clone_001_neg.ksh \
 	functional/cli_user/misc/zfs_create_001_neg.ksh \
 	functional/cli_user/misc/zfs_destroy_001_neg.ksh \
 	functional/cli_user/misc/zfs_get_001_neg.ksh \
 	functional/cli_user/misc/zfs_inherit_001_neg.ksh \
 	functional/cli_user/misc/zfs_mount_001_neg.ksh \
 	functional/cli_user/misc/zfs_promote_001_neg.ksh \
 	functional/cli_user/misc/zfs_receive_001_neg.ksh \
 	functional/cli_user/misc/zfs_rename_001_neg.ksh \
 	functional/cli_user/misc/zfs_rollback_001_neg.ksh \
 	functional/cli_user/misc/zfs_send_001_neg.ksh \
 	functional/cli_user/misc/zfs_set_001_neg.ksh \
 	functional/cli_user/misc/zfs_share_001_neg.ksh \
 	functional/cli_user/misc/zfs_snapshot_001_neg.ksh \
 	functional/cli_user/misc/zfs_unallow_001_neg.ksh \
 	functional/cli_user/misc/zfs_unmount_001_neg.ksh \
 	functional/cli_user/misc/zfs_unshare_001_neg.ksh \
 	functional/cli_user/misc/zfs_upgrade_001_neg.ksh \
 	functional/cli_user/misc/zpool_001_neg.ksh \
 	functional/cli_user/misc/zpool_add_001_neg.ksh \
 	functional/cli_user/misc/zpool_attach_001_neg.ksh \
 	functional/cli_user/misc/zpool_clear_001_neg.ksh \
 	functional/cli_user/misc/zpool_create_001_neg.ksh \
 	functional/cli_user/misc/zpool_destroy_001_neg.ksh \
 	functional/cli_user/misc/zpool_detach_001_neg.ksh \
 	functional/cli_user/misc/zpool_export_001_neg.ksh \
 	functional/cli_user/misc/zpool_get_001_neg.ksh \
 	functional/cli_user/misc/zpool_history_001_neg.ksh \
 	functional/cli_user/misc/zpool_import_001_neg.ksh \
 	functional/cli_user/misc/zpool_import_002_neg.ksh \
 	functional/cli_user/misc/zpool_offline_001_neg.ksh \
 	functional/cli_user/misc/zpool_online_001_neg.ksh \
 	functional/cli_user/misc/zpool_remove_001_neg.ksh \
 	functional/cli_user/misc/zpool_replace_001_neg.ksh \
 	functional/cli_user/misc/zpool_scrub_001_neg.ksh \
 	functional/cli_user/misc/zpool_set_001_neg.ksh \
 	functional/cli_user/misc/zpool_status_001_neg.ksh \
 	functional/cli_user/misc/zpool_upgrade_001_neg.ksh \
 	functional/cli_user/misc/zpool_wait_privilege.ksh \
 	functional/cli_user/zfs_list/cleanup.ksh \
 	functional/cli_user/zfs_list/setup.ksh \
 	functional/cli_user/zfs_list/zfs_list_001_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_002_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_003_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_004_neg.ksh \
 	functional/cli_user/zfs_list/zfs_list_005_neg.ksh \
 	functional/cli_user/zfs_list/zfs_list_007_pos.ksh \
 	functional/cli_user/zfs_list/zfs_list_008_neg.ksh \
 	functional/cli_user/zpool_iostat/cleanup.ksh \
 	functional/cli_user/zpool_iostat/setup.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh \
 	functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh \
 	functional/cli_user/zpool_list/cleanup.ksh \
 	functional/cli_user/zpool_list/setup.ksh \
 	functional/cli_user/zpool_list/zpool_list_001_pos.ksh \
 	functional/cli_user/zpool_list/zpool_list_002_neg.ksh \
 	functional/cli_user/zpool_status/cleanup.ksh \
 	functional/cli_user/zpool_status/setup.ksh \
 	functional/cli_user/zpool_status/zpool_status_003_pos.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_disable.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh \
 	functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh \
 	functional/compression/cleanup.ksh \
 	functional/compression/compress_001_pos.ksh \
 	functional/compression/compress_002_pos.ksh \
 	functional/compression/compress_003_pos.ksh \
 	functional/compression/compress_004_pos.ksh \
 	functional/compression/compress_zstd_bswap.ksh \
 	functional/compression/l2arc_compressed_arc_disabled.ksh \
 	functional/compression/l2arc_compressed_arc.ksh \
 	functional/compression/l2arc_encrypted.ksh \
 	functional/compression/l2arc_encrypted_no_compressed_arc.ksh \
 	functional/compression/setup.ksh \
 	functional/cp_files/cleanup.ksh \
 	functional/cp_files/cp_files_001_pos.ksh \
 	functional/cp_files/cp_files_002_pos.ksh \
 	functional/cp_files/cp_stress.ksh \
 	functional/cp_files/setup.ksh \
 	functional/crtime/cleanup.ksh \
 	functional/crtime/crtime_001_pos.ksh \
 	functional/crtime/setup.ksh \
 	functional/ctime/cleanup.ksh \
 	functional/ctime/ctime_001_pos.ksh \
 	functional/ctime/setup.ksh \
 	functional/deadman/deadman_ratelimit.ksh \
 	functional/deadman/deadman_sync.ksh \
 	functional/deadman/deadman_zio.ksh \
 	functional/delegate/cleanup.ksh \
 	functional/delegate/setup.ksh \
 	functional/delegate/zfs_allow_001_pos.ksh \
 	functional/delegate/zfs_allow_002_pos.ksh \
 	functional/delegate/zfs_allow_003_pos.ksh \
 	functional/delegate/zfs_allow_004_pos.ksh \
 	functional/delegate/zfs_allow_005_pos.ksh \
 	functional/delegate/zfs_allow_006_pos.ksh \
 	functional/delegate/zfs_allow_007_pos.ksh \
 	functional/delegate/zfs_allow_008_pos.ksh \
 	functional/delegate/zfs_allow_009_neg.ksh \
 	functional/delegate/zfs_allow_010_pos.ksh \
 	functional/delegate/zfs_allow_011_neg.ksh \
 	functional/delegate/zfs_allow_012_neg.ksh \
 	functional/delegate/zfs_unallow_001_pos.ksh \
 	functional/delegate/zfs_unallow_002_pos.ksh \
 	functional/delegate/zfs_unallow_003_pos.ksh \
 	functional/delegate/zfs_unallow_004_pos.ksh \
 	functional/delegate/zfs_unallow_005_pos.ksh \
 	functional/delegate/zfs_unallow_006_pos.ksh \
 	functional/delegate/zfs_unallow_007_neg.ksh \
 	functional/delegate/zfs_unallow_008_neg.ksh \
 	functional/devices/cleanup.ksh \
 	functional/devices/devices_001_pos.ksh \
 	functional/devices/devices_002_neg.ksh \
 	functional/devices/devices_003_pos.ksh \
 	functional/devices/setup.ksh \
 	functional/dos_attributes/cleanup.ksh \
 	functional/dos_attributes/read_dos_attrs_001.ksh \
 	functional/dos_attributes/setup.ksh \
 	functional/dos_attributes/write_dos_attrs_001.ksh \
 	functional/events/cleanup.ksh \
 	functional/events/events_001_pos.ksh \
 	functional/events/events_002_pos.ksh \
 	functional/events/setup.ksh \
 	functional/events/zed_cksum_config.ksh \
 	functional/events/zed_cksum_reported.ksh \
 	functional/events/zed_fd_spill.ksh \
 	functional/events/zed_io_config.ksh \
 	functional/events/zed_rc_filter.ksh \
 	functional/events/zed_slow_io.ksh \
 	functional/events/zed_slow_io_many_vdevs.ksh \
 	functional/exec/cleanup.ksh \
 	functional/exec/exec_001_pos.ksh \
 	functional/exec/exec_002_neg.ksh \
 	functional/exec/setup.ksh \
 	functional/fadvise/cleanup.ksh \
 	functional/fadvise/fadvise_sequential.ksh \
 	functional/fadvise/setup.ksh \
 	functional/fallocate/cleanup.ksh \
 	functional/fallocate/fallocate_prealloc.ksh \
 	functional/fallocate/fallocate_punch-hole.ksh \
 	functional/fallocate/fallocate_zero-range.ksh \
 	functional/fallocate/setup.ksh \
 	functional/fault/auto_offline_001_pos.ksh \
 	functional/fault/auto_online_001_pos.ksh \
 	functional/fault/auto_online_002_pos.ksh \
 	functional/fault/auto_replace_001_pos.ksh \
 	functional/fault/auto_replace_002_pos.ksh \
 	functional/fault/auto_spare_001_pos.ksh \
 	functional/fault/auto_spare_002_pos.ksh \
 	functional/fault/auto_spare_ashift.ksh \
 	functional/fault/auto_spare_multiple.ksh \
 	functional/fault/auto_spare_shared.ksh \
 	functional/fault/cleanup.ksh \
 	functional/fault/decompress_fault.ksh \
 	functional/fault/decrypt_fault.ksh \
 	functional/fault/scrub_after_resilver.ksh \
 	functional/fault/suspend_resume_single.ksh \
 	functional/fault/setup.ksh \
 	functional/fault/zpool_status_-s.ksh \
 	functional/features/async_destroy/async_destroy_001_pos.ksh \
 	functional/features/async_destroy/cleanup.ksh \
 	functional/features/async_destroy/setup.ksh \
 	functional/features/large_dnode/cleanup.ksh \
 	functional/features/large_dnode/large_dnode_001_pos.ksh \
 	functional/features/large_dnode/large_dnode_002_pos.ksh \
 	functional/features/large_dnode/large_dnode_003_pos.ksh \
 	functional/features/large_dnode/large_dnode_004_neg.ksh \
 	functional/features/large_dnode/large_dnode_005_pos.ksh \
 	functional/features/large_dnode/large_dnode_006_pos.ksh \
 	functional/features/large_dnode/large_dnode_007_neg.ksh \
 	functional/features/large_dnode/large_dnode_008_pos.ksh \
 	functional/features/large_dnode/large_dnode_009_pos.ksh \
 	functional/features/large_dnode/setup.ksh \
 	functional/grow/grow_pool_001_pos.ksh \
 	functional/grow/grow_replicas_001_pos.ksh \
 	functional/history/cleanup.ksh \
 	functional/history/history_001_pos.ksh \
 	functional/history/history_002_pos.ksh \
 	functional/history/history_003_pos.ksh \
 	functional/history/history_004_pos.ksh \
 	functional/history/history_005_neg.ksh \
 	functional/history/history_006_neg.ksh \
 	functional/history/history_007_pos.ksh \
 	functional/history/history_008_pos.ksh \
 	functional/history/history_009_pos.ksh \
 	functional/history/history_010_pos.ksh \
 	functional/history/setup.ksh \
 	functional/inheritance/cleanup.ksh \
 	functional/inheritance/inherit_001_pos.ksh \
 	functional/inuse/inuse_001_pos.ksh \
 	functional/inuse/inuse_003_pos.ksh \
 	functional/inuse/inuse_004_pos.ksh \
 	functional/inuse/inuse_005_pos.ksh \
 	functional/inuse/inuse_006_pos.ksh \
 	functional/inuse/inuse_007_pos.ksh \
 	functional/inuse/inuse_008_pos.ksh \
 	functional/inuse/inuse_009_pos.ksh \
 	functional/inuse/setup.ksh \
 	functional/io/cleanup.ksh \
 	functional/io/io_uring.ksh \
 	functional/io/libaio.ksh \
 	functional/io/mmap.ksh \
 	functional/io/posixaio.ksh \
 	functional/io/psync.ksh \
 	functional/io/setup.ksh \
 	functional/io/sync.ksh \
 	functional/l2arc/cleanup.ksh \
 	functional/l2arc/l2arc_arcstats_pos.ksh \
 	functional/l2arc/l2arc_l2miss_pos.ksh \
 	functional/l2arc/l2arc_mfuonly_pos.ksh \
 	functional/l2arc/persist_l2arc_001_pos.ksh \
 	functional/l2arc/persist_l2arc_002_pos.ksh \
 	functional/l2arc/persist_l2arc_003_neg.ksh \
 	functional/l2arc/persist_l2arc_004_pos.ksh \
 	functional/l2arc/persist_l2arc_005_pos.ksh \
 	functional/l2arc/setup.ksh \
 	functional/large_files/cleanup.ksh \
 	functional/large_files/large_files_001_pos.ksh \
 	functional/large_files/large_files_002_pos.ksh \
 	functional/large_files/setup.ksh \
 	functional/largest_pool/largest_pool_001_pos.ksh \
 	functional/libzfs/cleanup.ksh \
 	functional/libzfs/libzfs_input.ksh \
 	functional/libzfs/setup.ksh \
 	functional/limits/cleanup.ksh \
 	functional/limits/filesystem_count.ksh \
 	functional/limits/filesystem_limit.ksh \
 	functional/limits/setup.ksh \
 	functional/limits/snapshot_count.ksh \
 	functional/limits/snapshot_limit.ksh \
 	functional/link_count/cleanup.ksh \
 	functional/link_count/link_count_001.ksh \
 	functional/link_count/link_count_root_inode.ksh \
 	functional/link_count/setup.ksh \
 	functional/log_spacemap/log_spacemap_import_logs.ksh \
 	functional/migration/cleanup.ksh \
 	functional/migration/migration_001_pos.ksh \
 	functional/migration/migration_002_pos.ksh \
 	functional/migration/migration_003_pos.ksh \
 	functional/migration/migration_004_pos.ksh \
 	functional/migration/migration_005_pos.ksh \
 	functional/migration/migration_006_pos.ksh \
 	functional/migration/migration_007_pos.ksh \
 	functional/migration/migration_008_pos.ksh \
 	functional/migration/migration_009_pos.ksh \
 	functional/migration/migration_010_pos.ksh \
 	functional/migration/migration_011_pos.ksh \
 	functional/migration/migration_012_pos.ksh \
 	functional/migration/setup.ksh \
 	functional/mmap/cleanup.ksh \
 	functional/mmap/mmap_libaio_001_pos.ksh \
 	functional/mmap/mmap_mixed.ksh \
 	functional/mmap/mmap_read_001_pos.ksh \
 	functional/mmap/mmap_seek_001_pos.ksh \
 	functional/mmap/mmap_sync_001_pos.ksh \
 	functional/mmap/mmap_write_001_pos.ksh \
 	functional/mmap/setup.ksh \
 	functional/mmp/cleanup.ksh \
 	functional/mmp/mmp_active_import.ksh \
 	functional/mmp/mmp_exported_import.ksh \
 	functional/mmp/mmp_hostid.ksh \
 	functional/mmp/mmp_inactive_import.ksh \
 	functional/mmp/mmp_interval.ksh \
 	functional/mmp/mmp_on_off.ksh \
 	functional/mmp/mmp_on_thread.ksh \
 	functional/mmp/mmp_on_uberblocks.ksh \
 	functional/mmp/mmp_on_zdb.ksh \
 	functional/mmp/mmp_reset_interval.ksh \
 	functional/mmp/mmp_write_distribution.ksh \
 	functional/mmp/mmp_write_slow_disk.ksh \
 	functional/mmp/mmp_write_uberblocks.ksh \
 	functional/mmp/multihost_history.ksh \
 	functional/mmp/setup.ksh \
 	functional/mount/cleanup.ksh \
 	functional/mount/setup.ksh \
 	functional/mount/umount_001.ksh \
 	functional/mount/umountall_001.ksh \
 	functional/mount/umount_unlinked_drain.ksh \
 	functional/mv_files/cleanup.ksh \
 	functional/mv_files/mv_files_001_pos.ksh \
 	functional/mv_files/mv_files_002_pos.ksh \
 	functional/mv_files/random_creation.ksh \
 	functional/mv_files/setup.ksh \
 	functional/nestedfs/cleanup.ksh \
 	functional/nestedfs/nestedfs_001_pos.ksh \
 	functional/nestedfs/setup.ksh \
 	functional/nopwrite/cleanup.ksh \
 	functional/nopwrite/nopwrite_copies.ksh \
 	functional/nopwrite/nopwrite_mtime.ksh \
 	functional/nopwrite/nopwrite_negative.ksh \
 	functional/nopwrite/nopwrite_promoted_clone.ksh \
 	functional/nopwrite/nopwrite_recsize.ksh \
 	functional/nopwrite/nopwrite_sync.ksh \
 	functional/nopwrite/nopwrite_varying_compression.ksh \
 	functional/nopwrite/nopwrite_volume.ksh \
 	functional/nopwrite/setup.ksh \
 	functional/no_space/cleanup.ksh \
 	functional/no_space/enospc_001_pos.ksh \
 	functional/no_space/enospc_002_pos.ksh \
 	functional/no_space/enospc_003_pos.ksh \
 	functional/no_space/enospc_df.ksh \
 	functional/no_space/enospc_ganging.ksh \
 	functional/no_space/enospc_rm.ksh \
 	functional/no_space/setup.ksh \
 	functional/online_offline/cleanup.ksh \
 	functional/online_offline/online_offline_001_pos.ksh \
 	functional/online_offline/online_offline_002_neg.ksh \
 	functional/online_offline/online_offline_003_neg.ksh \
 	functional/online_offline/setup.ksh \
 	functional/pam/cleanup.ksh \
 	functional/pam/pam_basic.ksh \
 	functional/pam/pam_change_unmounted.ksh \
 	functional/pam/pam_nounmount.ksh \
 	functional/pam/pam_recursive.ksh \
 	functional/pam/pam_short_password.ksh \
 	functional/pam/setup.ksh \
 	functional/pool_checkpoint/checkpoint_after_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_big_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_capacity.ksh \
 	functional/pool_checkpoint/checkpoint_conf_change.ksh \
 	functional/pool_checkpoint/checkpoint_discard_busy.ksh \
 	functional/pool_checkpoint/checkpoint_discard.ksh \
 	functional/pool_checkpoint/checkpoint_discard_many.ksh \
 	functional/pool_checkpoint/checkpoint_indirect.ksh \
 	functional/pool_checkpoint/checkpoint_invalid.ksh \
 	functional/pool_checkpoint/checkpoint_lun_expsz.ksh \
 	functional/pool_checkpoint/checkpoint_open.ksh \
 	functional/pool_checkpoint/checkpoint_removal.ksh \
 	functional/pool_checkpoint/checkpoint_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_ro_rewind.ksh \
 	functional/pool_checkpoint/checkpoint_sm_scale.ksh \
 	functional/pool_checkpoint/checkpoint_twice.ksh \
 	functional/pool_checkpoint/checkpoint_vdev_add.ksh \
 	functional/pool_checkpoint/checkpoint_zdb.ksh \
 	functional/pool_checkpoint/checkpoint_zhack_feat.ksh \
 	functional/pool_checkpoint/cleanup.ksh \
 	functional/pool_checkpoint/setup.ksh \
 	functional/pool_names/pool_names_001_pos.ksh \
 	functional/pool_names/pool_names_002_neg.ksh \
 	functional/poolversion/cleanup.ksh \
 	functional/poolversion/poolversion_001_pos.ksh \
 	functional/poolversion/poolversion_002_pos.ksh \
 	functional/poolversion/setup.ksh \
 	functional/privilege/cleanup.ksh \
 	functional/privilege/privilege_001_pos.ksh \
 	functional/privilege/privilege_002_pos.ksh \
 	functional/privilege/setup.ksh \
 	functional/procfs/cleanup.ksh \
 	functional/procfs/pool_state.ksh \
 	functional/procfs/procfs_list_basic.ksh \
 	functional/procfs/procfs_list_concurrent_readers.ksh \
 	functional/procfs/procfs_list_stale_read.ksh \
 	functional/procfs/setup.ksh \
 	functional/projectquota/cleanup.ksh \
 	functional/projectquota/projectid_001_pos.ksh \
 	functional/projectquota/projectid_002_pos.ksh \
 	functional/projectquota/projectid_003_pos.ksh \
 	functional/projectquota/projectquota_001_pos.ksh \
 	functional/projectquota/projectquota_002_pos.ksh \
 	functional/projectquota/projectquota_003_pos.ksh \
 	functional/projectquota/projectquota_004_neg.ksh \
 	functional/projectquota/projectquota_005_pos.ksh \
 	functional/projectquota/projectquota_006_pos.ksh \
 	functional/projectquota/projectquota_007_pos.ksh \
 	functional/projectquota/projectquota_008_pos.ksh \
 	functional/projectquota/projectquota_009_pos.ksh \
 	functional/projectquota/projectspace_001_pos.ksh \
 	functional/projectquota/projectspace_002_pos.ksh \
 	functional/projectquota/projectspace_003_pos.ksh \
 	functional/projectquota/projectspace_004_pos.ksh \
 	functional/projectquota/projecttree_001_pos.ksh \
 	functional/projectquota/projecttree_002_pos.ksh \
 	functional/projectquota/projecttree_003_neg.ksh \
 	functional/projectquota/setup.ksh \
 	functional/quota/cleanup.ksh \
 	functional/quota/quota_001_pos.ksh \
 	functional/quota/quota_002_pos.ksh \
 	functional/quota/quota_003_pos.ksh \
 	functional/quota/quota_004_pos.ksh \
 	functional/quota/quota_005_pos.ksh \
 	functional/quota/quota_006_neg.ksh \
 	functional/quota/setup.ksh \
 	functional/raidz/cleanup.ksh \
 	functional/raidz/raidz_001_neg.ksh \
 	functional/raidz/raidz_002_pos.ksh \
 	functional/raidz/raidz_003_pos.ksh \
 	functional/raidz/raidz_004_pos.ksh \
 	functional/raidz/setup.ksh \
 	functional/redacted_send/cleanup.ksh \
 	functional/redacted_send/redacted_compressed.ksh \
 	functional/redacted_send/redacted_contents.ksh \
 	functional/redacted_send/redacted_deleted.ksh \
 	functional/redacted_send/redacted_disabled_feature.ksh \
 	functional/redacted_send/redacted_embedded.ksh \
 	functional/redacted_send/redacted_holes.ksh \
 	functional/redacted_send/redacted_incrementals.ksh \
 	functional/redacted_send/redacted_largeblocks.ksh \
 	functional/redacted_send/redacted_many_clones.ksh \
 	functional/redacted_send/redacted_mixed_recsize.ksh \
 	functional/redacted_send/redacted_mounts.ksh \
 	functional/redacted_send/redacted_negative.ksh \
 	functional/redacted_send/redacted_origin.ksh \
 	functional/redacted_send/redacted_panic.ksh \
 	functional/redacted_send/redacted_props.ksh \
 	functional/redacted_send/redacted_resume.ksh \
 	functional/redacted_send/redacted_size.ksh \
 	functional/redacted_send/redacted_volume.ksh \
 	functional/redacted_send/setup.ksh \
 	functional/redundancy/cleanup.ksh \
 	functional/redundancy/redundancy_draid1.ksh \
 	functional/redundancy/redundancy_draid2.ksh \
 	functional/redundancy/redundancy_draid3.ksh \
 	functional/redundancy/redundancy_draid_damaged1.ksh \
 	functional/redundancy/redundancy_draid_damaged2.ksh \
 	functional/redundancy/redundancy_draid.ksh \
 	functional/redundancy/redundancy_draid_spare1.ksh \
 	functional/redundancy/redundancy_draid_spare2.ksh \
 	functional/redundancy/redundancy_draid_spare3.ksh \
 	functional/redundancy/redundancy_mirror.ksh \
 	functional/redundancy/redundancy_raidz1.ksh \
 	functional/redundancy/redundancy_raidz2.ksh \
 	functional/redundancy/redundancy_raidz3.ksh \
 	functional/redundancy/redundancy_raidz.ksh \
 	functional/redundancy/redundancy_stripe.ksh \
 	functional/redundancy/setup.ksh \
 	functional/refquota/cleanup.ksh \
 	functional/refquota/refquota_001_pos.ksh \
 	functional/refquota/refquota_002_pos.ksh \
 	functional/refquota/refquota_003_pos.ksh \
 	functional/refquota/refquota_004_pos.ksh \
 	functional/refquota/refquota_005_pos.ksh \
 	functional/refquota/refquota_006_neg.ksh \
 	functional/refquota/refquota_007_neg.ksh \
 	functional/refquota/refquota_008_neg.ksh \
 	functional/refquota/setup.ksh \
 	functional/refreserv/cleanup.ksh \
 	functional/refreserv/refreserv_001_pos.ksh \
 	functional/refreserv/refreserv_002_pos.ksh \
 	functional/refreserv/refreserv_003_pos.ksh \
 	functional/refreserv/refreserv_004_pos.ksh \
 	functional/refreserv/refreserv_005_pos.ksh \
 	functional/refreserv/refreserv_multi_raidz.ksh \
 	functional/refreserv/refreserv_raidz.ksh \
 	functional/refreserv/setup.ksh \
 	functional/removal/cleanup.ksh \
 	functional/removal/removal_all_vdev.ksh \
 	functional/removal/removal_cancel.ksh \
 	functional/removal/removal_check_space.ksh \
 	functional/removal/removal_condense_export.ksh \
 	functional/removal/removal_multiple_indirection.ksh \
 	functional/removal/removal_nopwrite.ksh \
 	functional/removal/removal_remap_deadlists.ksh \
 	functional/removal/removal_reservation.ksh \
 	functional/removal/removal_resume_export.ksh \
 	functional/removal/removal_sanity.ksh \
 	functional/removal/removal_with_add.ksh \
 	functional/removal/removal_with_create_fs.ksh \
 	functional/removal/removal_with_dedup.ksh \
 	functional/removal/removal_with_errors.ksh \
 	functional/removal/removal_with_export.ksh \
 	functional/removal/removal_with_faulted.ksh \
 	functional/removal/removal_with_ganging.ksh \
 	functional/removal/removal_with_indirect.ksh \
 	functional/removal/removal_with_remove.ksh \
 	functional/removal/removal_with_scrub.ksh \
 	functional/removal/removal_with_send.ksh \
 	functional/removal/removal_with_send_recv.ksh \
 	functional/removal/removal_with_snapshot.ksh \
 	functional/removal/removal_with_write.ksh \
 	functional/removal/removal_with_zdb.ksh \
 	functional/removal/remove_attach_mirror.ksh \
 	functional/removal/remove_expanded.ksh \
 	functional/removal/remove_indirect.ksh \
 	functional/removal/remove_mirror.ksh \
 	functional/removal/remove_mirror_sanity.ksh \
 	functional/removal/remove_raidz.ksh \
 	functional/rename_dirs/cleanup.ksh \
 	functional/rename_dirs/rename_dirs_001_pos.ksh \
 	functional/rename_dirs/setup.ksh \
 	functional/renameat2/cleanup.ksh \
 	functional/renameat2/setup.ksh \
 	functional/renameat2/renameat2_exchange.ksh \
 	functional/renameat2/renameat2_noreplace.ksh \
 	functional/renameat2/renameat2_whiteout.ksh \
 	functional/replacement/attach_import.ksh \
 	functional/replacement/attach_multiple.ksh \
 	functional/replacement/attach_rebuild.ksh \
 	functional/replacement/attach_resilver.ksh \
 	functional/replacement/cleanup.ksh \
 	functional/replacement/detach.ksh \
 	functional/replacement/rebuild_disabled_feature.ksh \
 	functional/replacement/rebuild_multiple.ksh \
 	functional/replacement/rebuild_raidz.ksh \
 	functional/replacement/replace_import.ksh \
 	functional/replacement/replace_rebuild.ksh \
 	functional/replacement/replace_resilver.ksh \
 	functional/replacement/resilver_restart_001.ksh \
 	functional/replacement/resilver_restart_002.ksh \
 	functional/replacement/scrub_cancel.ksh \
 	functional/replacement/setup.ksh \
 	functional/reservation/cleanup.ksh \
 	functional/reservation/reservation_001_pos.ksh \
 	functional/reservation/reservation_002_pos.ksh \
 	functional/reservation/reservation_003_pos.ksh \
 	functional/reservation/reservation_004_pos.ksh \
 	functional/reservation/reservation_005_pos.ksh \
 	functional/reservation/reservation_006_pos.ksh \
 	functional/reservation/reservation_007_pos.ksh \
 	functional/reservation/reservation_008_pos.ksh \
 	functional/reservation/reservation_009_pos.ksh \
 	functional/reservation/reservation_010_pos.ksh \
 	functional/reservation/reservation_011_pos.ksh \
 	functional/reservation/reservation_012_pos.ksh \
 	functional/reservation/reservation_013_pos.ksh \
 	functional/reservation/reservation_014_pos.ksh \
 	functional/reservation/reservation_015_pos.ksh \
 	functional/reservation/reservation_016_pos.ksh \
 	functional/reservation/reservation_017_pos.ksh \
 	functional/reservation/reservation_018_pos.ksh \
 	functional/reservation/reservation_019_pos.ksh \
 	functional/reservation/reservation_020_pos.ksh \
 	functional/reservation/reservation_021_neg.ksh \
 	functional/reservation/reservation_022_pos.ksh \
 	functional/reservation/setup.ksh \
 	functional/rootpool/cleanup.ksh \
 	functional/rootpool/rootpool_002_neg.ksh \
 	functional/rootpool/rootpool_003_neg.ksh \
 	functional/rootpool/rootpool_007_pos.ksh \
 	functional/rootpool/setup.ksh \
 	functional/rsend/cleanup.ksh \
 	functional/rsend/recv_dedup_encrypted_zvol.ksh \
 	functional/rsend/recv_dedup.ksh \
 	functional/rsend/rsend_001_pos.ksh \
 	functional/rsend/rsend_002_pos.ksh \
 	functional/rsend/rsend_003_pos.ksh \
 	functional/rsend/rsend_004_pos.ksh \
 	functional/rsend/rsend_005_pos.ksh \
 	functional/rsend/rsend_006_pos.ksh \
 	functional/rsend/rsend_007_pos.ksh \
 	functional/rsend/rsend_008_pos.ksh \
 	functional/rsend/rsend_009_pos.ksh \
 	functional/rsend/rsend_010_pos.ksh \
 	functional/rsend/rsend_011_pos.ksh \
 	functional/rsend/rsend_012_pos.ksh \
 	functional/rsend/rsend_013_pos.ksh \
 	functional/rsend/rsend_014_pos.ksh \
 	functional/rsend/rsend_016_neg.ksh \
 	functional/rsend/rsend_019_pos.ksh \
 	functional/rsend/rsend_020_pos.ksh \
 	functional/rsend/rsend_021_pos.ksh \
 	functional/rsend/rsend_022_pos.ksh \
 	functional/rsend/rsend_024_pos.ksh \
 	functional/rsend/rsend_025_pos.ksh \
 	functional/rsend/rsend_026_neg.ksh \
 	functional/rsend/rsend_027_pos.ksh \
 	functional/rsend/rsend_028_neg.ksh \
 	functional/rsend/rsend_029_neg.ksh \
 	functional/rsend/rsend_030_pos.ksh \
 	functional/rsend/rsend_031_pos.ksh \
 	functional/rsend/send-c_embedded_blocks.ksh \
 	functional/rsend/send-c_incremental.ksh \
 	functional/rsend/send-c_lz4_disabled.ksh \
 	functional/rsend/send-c_mixed_compression.ksh \
 	functional/rsend/send-c_props.ksh \
 	functional/rsend/send-c_recv_dedup.ksh \
 	functional/rsend/send-c_recv_lz4_disabled.ksh \
 	functional/rsend/send-c_resume.ksh \
 	functional/rsend/send-c_stream_size_estimate.ksh \
 	functional/rsend/send-c_verify_contents.ksh \
 	functional/rsend/send-c_verify_ratio.ksh \
 	functional/rsend/send-c_volume.ksh \
 	functional/rsend/send-c_zstream_recompress.ksh \
 	functional/rsend/send-c_zstreamdump.ksh \
 	functional/rsend/send-cpL_varied_recsize.ksh \
 	functional/rsend/send_doall.ksh \
 	functional/rsend/send_encrypted_incremental.ksh \
 	functional/rsend/send_encrypted_files.ksh \
 	functional/rsend/send_encrypted_freeobjects.ksh \
 	functional/rsend/send_encrypted_hierarchy.ksh \
 	functional/rsend/send_encrypted_props.ksh \
 	functional/rsend/send_encrypted_truncated_files.ksh \
 	functional/rsend/send_freeobjects.ksh \
 	functional/rsend/send_holds.ksh \
 	functional/rsend/send_hole_birth.ksh \
 	functional/rsend/send_invalid.ksh \
 	functional/rsend/send-L_toggle.ksh \
 	functional/rsend/send_mixed_raw.ksh \
 	functional/rsend/send_partial_dataset.ksh \
 	functional/rsend/send_raw_ashift.ksh \
 	functional/rsend/send_raw_spill_block.ksh \
 	functional/rsend/send_raw_large_blocks.ksh \
 	functional/rsend/send_realloc_dnode_size.ksh \
 	functional/rsend/send_realloc_encrypted_files.ksh \
 	functional/rsend/send_realloc_files.ksh \
 	functional/rsend/send_spill_block.ksh \
 	functional/rsend/send-wR_encrypted_zvol.ksh \
 	functional/rsend/setup.ksh \
 	functional/scrub_mirror/cleanup.ksh \
 	functional/scrub_mirror/scrub_mirror_001_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_002_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_003_pos.ksh \
 	functional/scrub_mirror/scrub_mirror_004_pos.ksh \
 	functional/scrub_mirror/setup.ksh \
 	functional/slog/cleanup.ksh \
 	functional/slog/setup.ksh \
 	functional/slog/slog_001_pos.ksh \
 	functional/slog/slog_002_pos.ksh \
 	functional/slog/slog_003_pos.ksh \
 	functional/slog/slog_004_pos.ksh \
 	functional/slog/slog_005_pos.ksh \
 	functional/slog/slog_006_pos.ksh \
 	functional/slog/slog_007_pos.ksh \
 	functional/slog/slog_008_neg.ksh \
 	functional/slog/slog_009_neg.ksh \
 	functional/slog/slog_010_neg.ksh \
 	functional/slog/slog_011_neg.ksh \
 	functional/slog/slog_012_neg.ksh \
 	functional/slog/slog_013_pos.ksh \
 	functional/slog/slog_014_pos.ksh \
 	functional/slog/slog_015_neg.ksh \
 	functional/slog/slog_016_pos.ksh \
 	functional/slog/slog_replay_fs_001.ksh \
 	functional/slog/slog_replay_fs_002.ksh \
 	functional/slog/slog_replay_volume.ksh \
 	functional/snapshot/cleanup.ksh \
 	functional/snapshot/clone_001_pos.ksh \
 	functional/snapshot/rollback_001_pos.ksh \
 	functional/snapshot/rollback_002_pos.ksh \
 	functional/snapshot/rollback_003_pos.ksh \
 	functional/snapshot/setup.ksh \
 	functional/snapshot/snapshot_001_pos.ksh \
 	functional/snapshot/snapshot_002_pos.ksh \
 	functional/snapshot/snapshot_003_pos.ksh \
 	functional/snapshot/snapshot_004_pos.ksh \
 	functional/snapshot/snapshot_005_pos.ksh \
 	functional/snapshot/snapshot_006_pos.ksh \
 	functional/snapshot/snapshot_007_pos.ksh \
 	functional/snapshot/snapshot_008_pos.ksh \
 	functional/snapshot/snapshot_009_pos.ksh \
 	functional/snapshot/snapshot_010_pos.ksh \
 	functional/snapshot/snapshot_011_pos.ksh \
 	functional/snapshot/snapshot_012_pos.ksh \
 	functional/snapshot/snapshot_013_pos.ksh \
 	functional/snapshot/snapshot_014_pos.ksh \
 	functional/snapshot/snapshot_015_pos.ksh \
 	functional/snapshot/snapshot_016_pos.ksh \
 	functional/snapshot/snapshot_017_pos.ksh \
 	functional/snapshot/snapshot_018_pos.ksh \
 	functional/snapused/cleanup.ksh \
 	functional/snapused/setup.ksh \
 	functional/snapused/snapused_001_pos.ksh \
 	functional/snapused/snapused_002_pos.ksh \
 	functional/snapused/snapused_003_pos.ksh \
 	functional/snapused/snapused_004_pos.ksh \
 	functional/snapused/snapused_005_pos.ksh \
 	functional/sparse/cleanup.ksh \
 	functional/sparse/setup.ksh \
 	functional/sparse/sparse_001_pos.ksh \
 	functional/stat/cleanup.ksh \
 	functional/stat/setup.ksh \
 	functional/stat/stat_001_pos.ksh \
 	functional/suid/cleanup.ksh \
 	functional/suid/setup.ksh \
 	functional/suid/suid_write_to_none.ksh \
 	functional/suid/suid_write_to_sgid.ksh \
 	functional/suid/suid_write_to_suid.ksh \
 	functional/suid/suid_write_to_suid_sgid.ksh \
 	functional/suid/suid_write_zil_replay.ksh \
 	functional/trim/autotrim_config.ksh \
 	functional/trim/autotrim_integrity.ksh \
 	functional/trim/autotrim_trim_integrity.ksh \
 	functional/trim/cleanup.ksh \
 	functional/trim/setup.ksh \
 	functional/trim/trim_config.ksh \
 	functional/trim/trim_integrity.ksh \
 	functional/trim/trim_l2arc.ksh \
 	functional/truncate/cleanup.ksh \
 	functional/truncate/setup.ksh \
 	functional/truncate/truncate_001_pos.ksh \
 	functional/truncate/truncate_002_pos.ksh \
 	functional/truncate/truncate_timestamps.ksh \
 	functional/upgrade/cleanup.ksh \
 	functional/upgrade/setup.ksh \
 	functional/upgrade/upgrade_projectquota_001_pos.ksh \
 	functional/upgrade/upgrade_readonly_pool.ksh \
 	functional/upgrade/upgrade_userobj_001_pos.ksh \
 	functional/user_namespace/cleanup.ksh \
 	functional/user_namespace/setup.ksh \
 	functional/user_namespace/user_namespace_001.ksh \
 	functional/user_namespace/user_namespace_002.ksh \
 	functional/user_namespace/user_namespace_003.ksh \
 	functional/user_namespace/user_namespace_004.ksh \
 	functional/userquota/cleanup.ksh \
 	functional/userquota/groupspace_001_pos.ksh \
 	functional/userquota/groupspace_002_pos.ksh \
 	functional/userquota/groupspace_003_pos.ksh \
 	functional/userquota/setup.ksh \
 	functional/userquota/userquota_001_pos.ksh \
 	functional/userquota/userquota_002_pos.ksh \
 	functional/userquota/userquota_003_pos.ksh \
 	functional/userquota/userquota_004_pos.ksh \
 	functional/userquota/userquota_005_neg.ksh \
 	functional/userquota/userquota_006_pos.ksh \
 	functional/userquota/userquota_007_pos.ksh \
 	functional/userquota/userquota_008_pos.ksh \
 	functional/userquota/userquota_009_pos.ksh \
 	functional/userquota/userquota_010_pos.ksh \
 	functional/userquota/userquota_011_pos.ksh \
 	functional/userquota/userquota_012_neg.ksh \
 	functional/userquota/userquota_013_pos.ksh \
 	functional/userquota/userspace_001_pos.ksh \
 	functional/userquota/userspace_002_pos.ksh \
 	functional/userquota/userspace_003_pos.ksh \
 	functional/userquota/userspace_encrypted.ksh \
 	functional/userquota/userspace_send_encrypted.ksh \
 	functional/userquota/userspace_encrypted_13709.ksh \
 	functional/vdev_zaps/cleanup.ksh \
 	functional/vdev_zaps/setup.ksh \
 	functional/vdev_zaps/vdev_zaps_001_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_002_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_003_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_004_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_005_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_006_pos.ksh \
 	functional/vdev_zaps/vdev_zaps_007_pos.ksh \
 	functional/write_dirs/cleanup.ksh \
 	functional/write_dirs/setup.ksh \
 	functional/write_dirs/write_dirs_001_pos.ksh \
 	functional/write_dirs/write_dirs_002_pos.ksh \
 	functional/xattr/cleanup.ksh \
 	functional/xattr/setup.ksh \
 	functional/xattr/xattr_001_pos.ksh \
 	functional/xattr/xattr_002_neg.ksh \
 	functional/xattr/xattr_003_neg.ksh \
 	functional/xattr/xattr_004_pos.ksh \
 	functional/xattr/xattr_005_pos.ksh \
 	functional/xattr/xattr_006_pos.ksh \
 	functional/xattr/xattr_007_neg.ksh \
 	functional/xattr/xattr_008_pos.ksh \
 	functional/xattr/xattr_009_neg.ksh \
 	functional/xattr/xattr_010_neg.ksh \
 	functional/xattr/xattr_011_pos.ksh \
 	functional/xattr/xattr_012_pos.ksh \
 	functional/xattr/xattr_013_pos.ksh \
 	functional/xattr/xattr_compat.ksh \
 	functional/zpool_influxdb/cleanup.ksh \
 	functional/zpool_influxdb/setup.ksh \
 	functional/zpool_influxdb/zpool_influxdb.ksh \
 	functional/zvol/zvol_cli/cleanup.ksh \
 	functional/zvol/zvol_cli/setup.ksh \
 	functional/zvol/zvol_cli/zvol_cli_001_pos.ksh \
 	functional/zvol/zvol_cli/zvol_cli_002_pos.ksh \
 	functional/zvol/zvol_cli/zvol_cli_003_neg.ksh \
 	functional/zvol/zvol_ENOSPC/cleanup.ksh \
 	functional/zvol/zvol_ENOSPC/setup.ksh \
 	functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh \
 	functional/zvol/zvol_misc/cleanup.ksh \
 	functional/zvol/zvol_misc/setup.ksh \
 	functional/zvol/zvol_misc/zvol_misc_001_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_002_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_003_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \
 	functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \
 	functional/zvol/zvol_misc/zvol_misc_fua.ksh \
 	functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \
 	functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \
 	functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \
 	functional/zvol/zvol_misc/zvol_misc_trim.ksh \
 	functional/zvol/zvol_misc/zvol_misc_volmode.ksh \
 	functional/zvol/zvol_misc/zvol_misc_zil.ksh \
 	functional/zvol/zvol_stress/cleanup.ksh \
 	functional/zvol/zvol_stress/setup.ksh \
 	functional/zvol/zvol_stress/zvol_stress.ksh \
 	functional/zvol/zvol_swap/cleanup.ksh \
 	functional/zvol/zvol_swap/setup.ksh \
 	functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_002_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_003_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_004_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_005_pos.ksh \
 	functional/zvol/zvol_swap/zvol_swap_006_pos.ksh \
 	functional/idmap_mount/cleanup.ksh \
 	functional/idmap_mount/setup.ksh \
 	functional/idmap_mount/idmap_mount_001.ksh \
 	functional/idmap_mount/idmap_mount_002.ksh \
 	functional/idmap_mount/idmap_mount_003.ksh \
 	functional/idmap_mount/idmap_mount_004.ksh \
 	functional/idmap_mount/idmap_mount_005.ksh
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib
index e204f43b3bcd..795e71b26b5a 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib
@@ -1,68 +1,68 @@
 #
 # This file and its contents are supplied under the terms of the
 # Common Development and Distribution License ("CDDL"), version 1.0.
 # You may only use this file in accordance with the terms of version
 # 1.0 of the CDDL.
 #
 # A full copy of the text of the CDDL should have accompanied this
 # source.  A copy of the CDDL is also available via the Internet at
 # http://www.illumos.org/license/CDDL.
 #
 
 #
 # Copyright (c) 2017, Intel Corporation.
 # Copyright (c) 2018 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/alloc_class/alloc_class.cfg
 
 function disk_setup
 {
 	truncate -s $ZPOOL_DEVSIZE $ZPOOL_DISKS
 	truncate -s $CLASS_DEVSIZE $CLASS_DISKS
 }
 
 function disk_cleanup
 {
 	rm -f $ZPOOL_DEVSIZE $ZPOOL_DISKS 2> /dev/null
 	rm -f $CLASS_DEVSIZE $CLASS_DISKS 2> /dev/null
 }
 
 function cleanup
 {
 	if datasetexists $TESTPOOL ; then
 		zpool destroy -f $TESTPOOL 2> /dev/null
 	fi
 
 	disk_cleanup
 }
 
 #
 # Try zpool status/iostat for given pool
 #
 # $1 pool
 #
 function display_status
 {
 	typeset pool=$1
 
 	typeset -i ret=0
 	zpool status -xv $pool > /dev/null 2>&1
 	ret=$?
 
 	zpool iostat > /dev/null 2>&1
 	((ret |= $?))
 
 	typeset mntpnt=$(get_prop mountpoint $pool)
-	dd if=/dev/random of=$mntpnt/testfile.$$ &
+	dd if=/dev/urandom of=$mntpnt/testfile.$$ &
 	typeset pid=$!
 
 	zpool iostat -v 1 3 > /dev/null
 	((ret |= $?))
 
 	kill -9 $pid
 	wait $pid 2> /dev/null
 
 	return $ret
 }
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_rlimit_fsize.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_rlimit_fsize.ksh
new file mode 100755
index 000000000000..3632fc9a4df0
--- /dev/null
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/block_cloning/block_cloning_rlimit_fsize.ksh
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or https://opensource.org/licenses/CDDL-1.0.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib
+
+#
+# DESCRIPTION:
+#	When block cloning is used to implement copy_file_range(2), the
+#	RLIMIT_FSIZE limit must be respected.
+#
+# STRATEGY:
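+#	1. Create a pool with block cloning enabled; write a 1000-byte file.
+#	2. Clone with "ulimit -f 2" (must pass) and "ulimit -f 1" (must fail).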
+#
+
+verify_runnable "global"
+
+VDIR=$TEST_BASE_DIR/disk-bclone
+VDEV="$VDIR/a"
+
+function cleanup
+{
+	datasetexists $TESTPOOL && destroy_pool $TESTPOOL
+	rm -rf $VDIR
+}
+
+log_onexit cleanup
+
+log_assert "Test for RLIMIT_FSIZE handling with block cloning enabled"
+
+log_must rm -rf $VDIR
+log_must mkdir -p $VDIR
+log_must truncate -s 1G $VDEV
+
+log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/file1 bs=1 count=1000
+
+ulimit -f 2
+log_must clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file2 0 0 all
+ulimit -f 1
+log_mustnot clonefile -f /$TESTPOOL/file1 /$TESTPOOL/file3 0 0 all
+
+log_pass "copy_file_range(2) respects RLIMIT_FSIZE"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib
index 9911ccdf536d..a7a93a3046d2 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib
@@ -1,157 +1,159 @@
 #
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
 # Common Development and Distribution License (the "License").
 # You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or https://opensource.org/licenses/CDDL-1.0.
 # See the License for the specific language governing permissions
 # and limitations under the License.
 #
 # When distributing Covered Code, include this CDDL HEADER in each
 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 # If applicable, add the following below this CDDL HEADER, with the
 # fields enclosed by brackets "[]" replaced with your own identifying
 # information: Portions Copyright [yyyy] [name of copyright owner]
 #
 # CDDL HEADER END
 #
 
 #
 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
 #
 # Copyright (c) 2012, 2016 by Delphix. All rights reserved.
 #
 
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/cli_root/zfs_copies/zfs_copies.cfg
 
 #
 # Compare the value of copies property with specified value
 # $1, the dataset name
 # $2, the expected copies value
 #
 function cmp_prop
 {
 	typeset ds=$1
 	typeset	val_expect=$2
 	typeset val_actual
 
 	val_actual=$(get_prop copies $ds)
 	if [[ $val_actual != $val_expect ]]; then
 		log_fail "Expected value ($val_expect) != actual value " \
 		    "($val_actual)"
 	fi
 }
 
 #
 # Check the used space is charged correctly
 # $1, the number of used space
 # $2, the expected common factor between the used space and the file space
 #
 function check_used
 {
 	typeset charged_spc=$1
 	typeset -i used
 	typeset -i expected_cfactor=$2
 	typeset -i cfactor
 	typeset -i fsize=${FILESIZE%[m|M]}
 
 	((used = $charged_spc / 1024 / 1024))
 	((cfactor = used / fsize))
 	if ((cfactor != expected_cfactor)); then
 		log_fail "The space is not charged correctly while setting" \
 		    "copies as $expected_cfactor."
 	fi
 }
 
 #
 # test ncopies on volume
 # $1  test type zfs|ufs|ext2
 # $2  copies
 # $3  mntp for ufs|ext2 test
 function do_vol_test
 {
 	typeset type=$1
 	typeset copies=$2
 	typeset mntp=$3
 
 	vol=$TESTPOOL/$TESTVOL1
 	vol_b_path=$ZVOL_DEVDIR/$TESTPOOL/$TESTVOL1
 
 	log_must zfs create -V $VOLSIZE -o compression=off -o copies=$copies \
 	    $vol
 	log_must zfs set refreservation=none $vol
 	block_device_wait $vol_b_path
 
 	case "$type" in
 	"ext2")
 		if is_freebsd; then
 			log_unsupported "ext2 test not implemented for freebsd"
 		fi
 		log_must eval "new_fs $vol_b_path >/dev/null 2>&1"
 		log_must mount -o rw $vol_b_path $mntp
 		;;
 	"ufs")
 		if is_linux; then
 			log_unsupported "ufs test not implemented for linux"
 		fi
 		log_must eval "new_fs $vol_b_path >/dev/null 2>&1"
 		log_must mount $vol_b_path $mntp
 		;;
 	"zfs")
 		if is_freebsd; then
 			# Pool creation on zvols is forbidden by default.
 			# Save and restore the current setting.
 			typeset _saved=$(get_tunable VOL_RECURSIVE)
 			log_must set_tunable64 VOL_RECURSIVE 1 # Allow
 			zpool create $TESTPOOL1 $vol_b_path
 			typeset _zpool_create_result=$?
 			log_must set_tunable64 VOL_RECURSIVE $_saved # Restore
 			log_must test $_zpool_create_result = 0
 		else
 			log_must zpool create $TESTPOOL1 $vol_b_path
 		fi
 		log_must zfs create -o compression=off $TESTPOOL1/$TESTFS1
 		sync_pool $TESTPOOL1
 		;;
 	*)
 		log_unsupported "$type test not implemented"
 		;;
 	esac
 
 	sync_pool $TESTPOOL
 	pre_used=$(get_prop used $vol)
 
 	if [[ $type == "zfs" ]]; then
 		log_must mkfile $FILESIZE /$TESTPOOL1/$TESTFS1/$FILE
 		sync_pool $TESTPOOL1
 	else
 		log_must mkfile $FILESIZE $mntp/$FILE
 		log_must sync
 	fi
 
 	sync_pool $TESTPOOL
 	post_used=$(get_prop used $vol)
 
 	((used = post_used - pre_used))
 	((nfilesize = copies * ${FILESIZE%m}))
 	if ((used < nfilesize)); then
 		log_fail "The space is not charged correctly while setting" \
 		    "copies as $copies ($used < $nfilesize)" \
 		    "pre=${pre_used} post=${post_used}"
 	fi
 
 	if [[ $type == "zfs" ]]; then
 		log_must zpool destroy $TESTPOOL1
 	else
 		log_must umount $mntp
 	fi
 
+	# Ubuntu 20.04 wants a sync here
+	log_must sync
 	log_must zfs destroy $vol
 }
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh
index 041dadb1eadb..05f3ac708477 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/suspend_resume_single.ksh
@@ -1,102 +1,102 @@
 #!/bin/ksh -p
 #
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
 # Common Development and Distribution License (the "License").
 # You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or https://opensource.org/licenses/CDDL-1.0.
 # See the License for the specific language governing permissions
 # and limitations under the License.
 #
 # When distributing Covered Code, include this CDDL HEADER in each
 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 # If applicable, add the following below this CDDL HEADER, with the
 # fields enclosed by brackets "[]" replaced with your own identifying
 # information: Portions Copyright [yyyy] [name of copyright owner]
 #
 # CDDL HEADER END
 #
 
 #
 # Copyright (c) 2024, Klara Inc.
 #
 
 . $STF_SUITE/include/libtest.shlib
 
 set -x
 
 DATAFILE="$TMPDIR/datafile"
 
 function cleanup
 {
 	destroy_pool $TESTPOOL
 	unload_scsi_debug
-	rm -f $DATA_FILE
+	rm -f $DATAFILE # checksum source file created by this test
 }
 
 log_onexit cleanup
 
 log_assert "ensure single-disk pool resumes properly after suspend and clear"
 
 # create a file, and take a checksum, so we can compare later
-log_must dd if=/dev/random of=$DATAFILE bs=128K count=1
+log_must dd if=/dev/urandom of=$DATAFILE bs=128K count=1
 typeset sum1=$(cat $DATAFILE | md5sum)
 
 # make a debug device that we can "unplug"
 load_scsi_debug 100 1 1 1 '512b'
 sd=$(get_debug_device)
 
 # create a single-device pool
 log_must zpool create $TESTPOOL $sd
 log_must zpool sync
 
 # "pull" the disk
 log_must eval "echo offline > /sys/block/$sd/device/state"
 
 # copy data onto the pool. it'll appear to succeed, but only be in memory
 log_must cp $DATAFILE /$TESTPOOL/file
 
 # wait until sync starts, and the pool suspends
 log_note "waiting for pool to suspend"
 typeset -i tries=10
 until [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; do
 	if ((tries-- == 0)); then
 		log_fail "pool didn't suspend"
 	fi
 	sleep 1
 done
 
 # return the disk
 log_must eval "echo running > /sys/block/$sd/device/state"
 
 # clear the error states, which should reopen the vdev, get the pool back
 # online, and replay the failed IO
 log_must zpool clear $TESTPOOL
 
 # wait a while for everything to sync out. if something is going to go wrong,
 # this is where it will happen
 log_note "giving pool time to settle and complete txg"
 sleep 7
 
 # if the pool suspended, then everything is bad
 if [[ $(cat /proc/spl/kstat/zfs/$TESTPOOL/state) == "SUSPENDED" ]] ; then
 	log_fail "pool suspended"
 fi
 
 # export the pool, to make sure it exports clean, and also to clear the file
 # out of the cache
 log_must zpool export $TESTPOOL
 
 # import the pool
 log_must zpool import $TESTPOOL
 
 # sum the file we wrote earlier
 typeset sum2=$(cat /$TESTPOOL/file | md5sum)
 
 # make sure the checksums match
 log_must test "$sum1" = "$sum2"
 
 log_pass "single-disk pool resumes properly after disk suspend and clear"
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/history/history.cfg b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/history/history.cfg
index a508a7935684..6020443bcdb0 100644
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/history/history.cfg
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/history/history.cfg
@@ -1,49 +1,45 @@
 #
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
 # Common Development and Distribution License (the "License").
 # You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or https://opensource.org/licenses/CDDL-1.0.
 # See the License for the specific language governing permissions
 # and limitations under the License.
 #
 # When distributing Covered Code, include this CDDL HEADER in each
 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 # If applicable, add the following below this CDDL HEADER, with the
 # fields enclosed by brackets "[]" replaced with your own identifying
 # information: Portions Copyright [yyyy] [name of copyright owner]
 #
 # CDDL HEADER END
 #
 
 #
 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
 #
 
 #
 # Copyright (c) 2013 by Delphix. All rights reserved.
 #
 
 export ZFSROOT=
 
 export MPOOL=mpool.$$
 
 export OLD_HISTORY=$TEST_BASE_DIR/old_history.$$
 export TMP_HISTORY=$TEST_BASE_DIR/tmp_history.$$
 export NEW_HISTORY=$TEST_BASE_DIR/new_history.$$
 
 export MIGRATEDPOOLNAME=${MIGRATEDPOOLNAME:-history_pool}
-if is_freebsd; then
-	export TIMEZONE=${TIMEZONE:-America/Denver}
-else
-	export TIMEZONE=${TIMEZONE:-US/Mountain}
-fi
+export TIMEZONE=${TIMEZONE:-America/Denver}
 
 export HIST_USER="huser"
 export HIST_GROUP="hgroup"
 
 export TESTVOL=testvol.$$
diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/io/io_uring.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/io/io_uring.ksh
index 2fa146556358..f14b9f450826 100755
--- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/io/io_uring.ksh
+++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/io/io_uring.ksh
@@ -1,79 +1,79 @@
 #! /bin/ksh -p
 #
 # CDDL HEADER START
 #
 # The contents of this file are subject to the terms of the
 # Common Development and Distribution License (the "License").
 # You may not use this file except in compliance with the License.
 #
 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 # or https://opensource.org/licenses/CDDL-1.0.
 # See the License for the specific language governing permissions
 # and limitations under the License.
 #
 # When distributing Covered Code, include this CDDL HEADER in each
 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 # If applicable, add the following below this CDDL HEADER, with the
 # fields enclosed by brackets "[]" replaced with your own identifying
 # information: Portions Copyright [yyyy] [name of copyright owner]
 #
 # CDDL HEADER END
 #
 
 #
 # Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
 #
 
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/tests/functional/io/io.cfg
 
 #
 # DESCRIPTION:
 #	Verify Linux io_uring.
 #
 # STRATEGY:
 #	1. Use fio(1) in verify mode to perform write, read,
 #	   random read, and random write workloads.
 #	2. Repeat the test with additional fio(1) options.
 #
 
 verify_runnable "global"
 
 
 if ! $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then
-	log_unsupported "Requires io_uring support"
+	log_unsupported "Requires io_uring support within Kernel"
 fi
 
 if [ -e /etc/os-release ] ; then
 	source /etc/os-release
-	if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then
-		log_unsupported "Disabled on CentOS 9, fails with 'Operation not permitted'"
+	if [ "$PLATFORM_ID" = "platform:el9" ]; then # quoted: may be unset
+		log_unsupported "Disabled on RHEL 9 variants: fails with 'Operation not permitted'"
 	fi
 fi
 
 fio --ioengine=io_uring --parse-only || log_unsupported "fio io_uring support required"
 
 function cleanup
 {
 	log_must rm -f "$mntpnt/rw*"
 }
 
 log_assert "Verify Linux io_uring"
 
 log_onexit cleanup
 
 ioengine="--ioengine=io_uring"
 mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
 dir="--directory=$mntpnt"
 
 set -A fio_arg -- "--sync=0" "--sync=1" "--direct=0" "--direct=1"
 
 for arg in "${fio_arg[@]}"; do
 	log_must fio $dir $ioengine $arg $FIO_WRITE_ARGS
 	log_must fio $dir $ioengine $arg $FIO_READ_ARGS
 	log_must fio $dir $ioengine $arg $FIO_RANDWRITE_ARGS
 	log_must fio $dir $ioengine $arg $FIO_RANDREAD_ARGS
 	log_must rm -f "$mntpnt/rw*"
 done
 
 log_pass "Verified Linux io_uring"
diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h
index af6633ec205e..6f1ec6dd8d83 100644
--- a/sys/modules/zfs/zfs_config.h
+++ b/sys/modules/zfs/zfs_config.h
@@ -1,1236 +1,1248 @@
 /*
  */
 
 /* zfs_config.h.  Generated from zfs_config.h.in by configure.  */
 /* zfs_config.h.in.  Generated from configure.ac by autoheader.  */
 
 /* Define to 1 if translation of program messages to the user's native
    language is requested. */
 /* #undef ENABLE_NLS */
 
 /* bio_end_io_t wants 1 arg */
 /* #undef HAVE_1ARG_BIO_END_IO_T */
 
 /* lookup_bdev() wants 1 arg */
 /* #undef HAVE_1ARG_LOOKUP_BDEV */
 
 /* submit_bio() wants 1 arg */
 /* #undef HAVE_1ARG_SUBMIT_BIO */
 
 /* bdi_setup_and_register() wants 2 args */
 /* #undef HAVE_2ARGS_BDI_SETUP_AND_REGISTER */
 
 /* vfs_getattr wants 2 args */
 /* #undef HAVE_2ARGS_VFS_GETATTR */
 
 /* zlib_deflate_workspacesize() wants 2 args */
 /* #undef HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE */
 
 /* bdi_setup_and_register() wants 3 args */
 /* #undef HAVE_3ARGS_BDI_SETUP_AND_REGISTER */
 
 /* vfs_getattr wants 3 args */
 /* #undef HAVE_3ARGS_VFS_GETATTR */
 
 /* vfs_getattr wants 4 args */
 /* #undef HAVE_4ARGS_VFS_GETATTR */
 
 /* kernel has access_ok with 'type' parameter */
 /* #undef HAVE_ACCESS_OK_TYPE */
 
 /* posix_acl has refcount_t */
 /* #undef HAVE_ACL_REFCOUNT */
 
 /* add_disk() returns int */
 /* #undef HAVE_ADD_DISK_RET */
 
 /* Define if host toolchain supports AES */
 #define HAVE_AES 1
 
 /* Define if you have [rt] */
 #define HAVE_AIO_H 1
 
 #ifdef __amd64__
 #ifndef RESCUE
 /* Define if host toolchain supports AVX */
 #define HAVE_AVX 1
 #endif
 
 /* Define if host toolchain supports AVX2 */
 #define HAVE_AVX2 1
 
 /* Define if host toolchain supports AVX512BW */
 #define HAVE_AVX512BW 1
 
 /* Define if host toolchain supports AVX512CD */
 #define HAVE_AVX512CD 1
 
 /* Define if host toolchain supports AVX512DQ */
 #define HAVE_AVX512DQ 1
 
 /* Define if host toolchain supports AVX512ER */
 #define HAVE_AVX512ER 1
 
 /* Define if host toolchain supports AVX512F */
 #define HAVE_AVX512F 1
 
 /* Define if host toolchain supports AVX512IFMA */
 #define HAVE_AVX512IFMA 1
 
 /* Define if host toolchain supports AVX512PF */
 #define HAVE_AVX512PF 1
 
 /* Define if host toolchain supports AVX512VBMI */
 #define HAVE_AVX512VBMI 1
 
 /* Define if host toolchain supports AVX512VL */
 #define HAVE_AVX512VL 1
 #endif
 
 /* backtrace() is available */
 /* #undef HAVE_BACKTRACE */
 
 /* bdevname() is available */
 /* #undef HAVE_BDEVNAME */
 
 /* bdev_check_media_change() exists */
 /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */
 
 /* bdev_file_open_by_path() exists */
 /* #undef HAVE_BDEV_FILE_OPEN_BY_PATH */
 
 /* bdev_*_io_acct() available */
 /* #undef HAVE_BDEV_IO_ACCT_63 */
 
 /* bdev_*_io_acct() available */
 /* #undef HAVE_BDEV_IO_ACCT_OLD */
 
 /* bdev_kobj() exists */
 /* #undef HAVE_BDEV_KOBJ */
 
 /* bdev_max_discard_sectors() is available */
 /* #undef HAVE_BDEV_MAX_DISCARD_SECTORS */
 
 /* bdev_max_secure_erase_sectors() is available */
 /* #undef HAVE_BDEV_MAX_SECURE_ERASE_SECTORS */
 
 /* bdev_nr_bytes() is available */
 /* #undef HAVE_BDEV_NR_BYTES */
 
 /* bdev_open_by_path() exists */
 /* #undef HAVE_BDEV_OPEN_BY_PATH */
 
 /* bdev_release() exists */
 /* #undef HAVE_BDEV_RELEASE */
 
 /* block_device_operations->submit_bio() returns void */
 /* #undef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID */
 
 /* bdev_whole() is available */
 /* #undef HAVE_BDEV_WHOLE */
 
 /* bio_alloc() takes 4 arguments */
 /* #undef HAVE_BIO_ALLOC_4ARG */
 
 /* bio->bi_bdev->bd_disk exists */
 /* #undef HAVE_BIO_BDEV_DISK */
 
 /* bio->bi_opf is defined */
 /* #undef HAVE_BIO_BI_OPF */
 
 /* bio->bi_status exists */
 /* #undef HAVE_BIO_BI_STATUS */
 
 /* bio has bi_iter */
 /* #undef HAVE_BIO_BVEC_ITER */
 
 /* bio_*_io_acct() available */
 /* #undef HAVE_BIO_IO_ACCT */
 
 /* bio_max_segs() is implemented */
 /* #undef HAVE_BIO_MAX_SEGS */
 
 /* bio_set_dev() is available */
 /* #undef HAVE_BIO_SET_DEV */
 
 /* bio_set_dev() GPL-only */
 /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */
 
 /* bio_set_dev() is a macro */
 /* #undef HAVE_BIO_SET_DEV_MACRO */
 
 /* bio_set_op_attrs is available */
 /* #undef HAVE_BIO_SET_OP_ATTRS */
 
 /* blkdev_get_by_path() exists and takes 4 args */
 /* #undef HAVE_BLKDEV_GET_BY_PATH_4ARG */
 
 /* blkdev_get_by_path() handles ERESTARTSYS */
 /* #undef HAVE_BLKDEV_GET_ERESTARTSYS */
 
 /* __blkdev_issue_discard(flags) is available */
 /* #undef HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS */
 
 /* __blkdev_issue_discard() is available */
 /* #undef HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS */
 
 /* blkdev_issue_discard(flags) is available */
 /* #undef HAVE_BLKDEV_ISSUE_DISCARD_FLAGS */
 
 /* blkdev_issue_discard() is available */
 /* #undef HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS */
 
 /* blkdev_issue_secure_erase() is available */
 /* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */
 
 /* blkdev_put() exists */
 /* #undef HAVE_BLKDEV_PUT */
 
 /* blkdev_put() accepts void* as arg 2 */
 /* #undef HAVE_BLKDEV_PUT_HOLDER */
 
 /* blkdev_reread_part() exists */
 /* #undef HAVE_BLKDEV_REREAD_PART */
 
 /* blkg_tryget() is available */
 /* #undef HAVE_BLKG_TRYGET */
 
 /* blkg_tryget() GPL-only */
 /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */
 
 /* blk_alloc_disk() exists */
 /* #undef HAVE_BLK_ALLOC_DISK */
 
 /* blk_alloc_disk() exists and takes 2 args */
 /* #undef HAVE_BLK_ALLOC_DISK_2ARG */
 
 /* blk_alloc_queue() expects request function */
 /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */
 
 /* blk_alloc_queue_rh() expects request function */
 /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */
 
 /* blk_cleanup_disk() exists */
 /* #undef HAVE_BLK_CLEANUP_DISK */
 
 /* blk_mode_t is defined */
 /* #undef HAVE_BLK_MODE_T */
 
 /* block multiqueue is available */
 /* #undef HAVE_BLK_MQ */
 
 /* block multiqueue hardware context is cached in struct request */
 /* #undef HAVE_BLK_MQ_RQ_HCTX */
 
 /* blk queue backing_dev_info is dynamic */
 /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */
 
 /* blk_queue_discard() is available */
 /* #undef HAVE_BLK_QUEUE_DISCARD */
 
+/* backing_dev_info is available through queue gendisk */
+/* #undef HAVE_BLK_QUEUE_DISK_BDI */
+
 /* blk_queue_flag_clear() exists */
 /* #undef HAVE_BLK_QUEUE_FLAG_CLEAR */
 
 /* blk_queue_flag_set() exists */
 /* #undef HAVE_BLK_QUEUE_FLAG_SET */
 
 /* blk_queue_flush() is available */
 /* #undef HAVE_BLK_QUEUE_FLUSH */
 
 /* blk_queue_flush() is GPL-only */
 /* #undef HAVE_BLK_QUEUE_FLUSH_GPL_ONLY */
 
 /* blk_queue_secdiscard() is available */
 /* #undef HAVE_BLK_QUEUE_SECDISCARD */
 
 /* blk_queue_secure_erase() is available */
 /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */
 
 /* blk_queue_update_readahead() exists */
 /* #undef HAVE_BLK_QUEUE_UPDATE_READAHEAD */
 
 /* blk_queue_write_cache() exists */
 /* #undef HAVE_BLK_QUEUE_WRITE_CACHE */
 
 /* blk_queue_write_cache() is GPL-only */
 /* #undef HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY */
 
 /* BLK_STS_RESV_CONFLICT is defined */
 /* #undef HAVE_BLK_STS_RESV_CONFLICT */
 
 /* Define if release() in block_device_operations takes 1 arg */
 /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG */
 
 /* Define if revalidate_disk() in block_device_operations */
 /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */
 
 /* Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the
    CoreFoundation framework. */
 /* #undef HAVE_CFLOCALECOPYCURRENT */
 
 /* Define to 1 if you have the Mac OS X function
    CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */
 /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */
 
 /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in
    the CoreFoundation framework. */
 /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */
 
 /* check_disk_change() exists */
 /* #undef HAVE_CHECK_DISK_CHANGE */
 
 /* clear_inode() is available */
 /* #undef HAVE_CLEAR_INODE */
 
 /* dentry uses const struct dentry_operations */
 /* #undef HAVE_CONST_DENTRY_OPERATIONS */
 
 /* copy_from_iter() is available */
 /* #undef HAVE_COPY_FROM_ITER */
 
 /* copy_splice_read exists */
 /* #undef HAVE_COPY_SPLICE_READ */
 
 /* copy_to_iter() is available */
 /* #undef HAVE_COPY_TO_ITER */
 
 /* cpu_has_feature() is GPL-only */
 /* #undef HAVE_CPU_HAS_FEATURE_GPL_ONLY */
 
 /* yes */
 /* #undef HAVE_CPU_HOTPLUG */
 
 /* current_time() exists */
 /* #undef HAVE_CURRENT_TIME */
 
 /* Define if the GNU dcgettext() function is already present or preinstalled.
    */
 /* #undef HAVE_DCGETTEXT */
 
 /* DECLARE_EVENT_CLASS() is available */
 /* #undef HAVE_DECLARE_EVENT_CLASS */
 
 /* dentry aliases are in d_u member */
 /* #undef HAVE_DENTRY_D_U_ALIASES */
 
 /* dequeue_signal() takes 4 arguments */
 /* #undef HAVE_DEQUEUE_SIGNAL_4ARG */
 
 /* lookup_bdev() wants dev_t arg */
 /* #undef HAVE_DEVT_LOOKUP_BDEV */
 
 /* sops->dirty_inode() wants flags */
 /* #undef HAVE_DIRTY_INODE_WITH_FLAGS */
 
 /* disk_check_media_change() exists */
 /* #undef HAVE_DISK_CHECK_MEDIA_CHANGE */
 
 /* disk_*_io_acct() available */
 /* #undef HAVE_DISK_IO_ACCT */
 
 /* disk_update_readahead() exists */
 /* #undef HAVE_DISK_UPDATE_READAHEAD */
 
 /* Define to 1 if you have the <dlfcn.h> header file. */
 #define HAVE_DLFCN_H 1
 
 /* d_make_root() is available */
 /* #undef HAVE_D_MAKE_ROOT */
 
 /* d_prune_aliases() is available */
 /* #undef HAVE_D_PRUNE_ALIASES */
 
 /* dops->d_revalidate() operation takes nameidata */
 /* #undef HAVE_D_REVALIDATE_NAMEIDATA */
 
 /* eops->encode_fh() wants child and parent inodes */
 /* #undef HAVE_ENCODE_FH_WITH_INODE */
 
 /* sops->evict_inode() exists */
 /* #undef HAVE_EVICT_INODE */
 
 /* Define to 1 if you have the 'execvpe' function. */
 #define HAVE_EXECVPE 1
 
 /* FALLOC_FL_ZERO_RANGE is defined */
 /* #undef HAVE_FALLOC_FL_ZERO_RANGE */
 
 /* fault_in_iov_iter_readable() is available */
 /* #undef HAVE_FAULT_IN_IOV_ITER_READABLE */
 
 /* filemap_range_has_page() is available */
 /* #undef HAVE_FILEMAP_RANGE_HAS_PAGE */
 
 /* fops->aio_fsync() exists */
 /* #undef HAVE_FILE_AIO_FSYNC */
 
 /* file_dentry() is available */
 /* #undef HAVE_FILE_DENTRY */
 
 /* fops->fadvise() exists */
 /* #undef HAVE_FILE_FADVISE */
 
 /* file_inode() is available */
 /* #undef HAVE_FILE_INODE */
 
 /* flush_dcache_page() is GPL-only */
 /* #undef HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY */
 
 /* iops->follow_link() cookie */
 /* #undef HAVE_FOLLOW_LINK_COOKIE */
 
 /* iops->follow_link() nameidata */
 /* #undef HAVE_FOLLOW_LINK_NAMEIDATA */
 
 /* Define if compiler supports -Wformat-overflow */
 /* #undef HAVE_FORMAT_OVERFLOW */
 
 /* fsync_bdev() is declared in include/blkdev.h */
 /* #undef HAVE_FSYNC_BDEV */
 
 /* fops->fsync() with range */
 /* #undef HAVE_FSYNC_RANGE */
 
 /* fops->fsync() without dentry */
 /* #undef HAVE_FSYNC_WITHOUT_DENTRY */
 
 /* yes */
 /* #undef HAVE_GENERIC_FADVISE */
 
 /* generic_fillattr requires struct mnt_idmap* */
 /* #undef HAVE_GENERIC_FILLATTR_IDMAP */
 
 /* generic_fillattr requires struct mnt_idmap* and u32 request_mask */
 /* #undef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK */
 
 /* generic_fillattr requires struct user_namespace* */
 /* #undef HAVE_GENERIC_FILLATTR_USERNS */
 
 /* generic_*_io_acct() 3 arg available */
 /* #undef HAVE_GENERIC_IO_ACCT_3ARG */
 
 /* generic_*_io_acct() 4 arg available */
 /* #undef HAVE_GENERIC_IO_ACCT_4ARG */
 
 /* generic_readlink is global */
 /* #undef HAVE_GENERIC_READLINK */
 
 /* generic_setxattr() exists */
 /* #undef HAVE_GENERIC_SETXATTR */
 
 /* generic_write_checks() takes kiocb */
 /* #undef HAVE_GENERIC_WRITE_CHECKS_KIOCB */
 
 /* Define if the GNU gettext() function is already present or preinstalled. */
 /* #undef HAVE_GETTEXT */
 
 /* Define to 1 if you have the 'gettid' function. */
 /* #undef HAVE_GETTID */
 
 /* iops->get_acl() exists */
 /* #undef HAVE_GET_ACL */
 
 /* iops->get_acl() takes rcu */
 /* #undef HAVE_GET_ACL_RCU */
 
 /* has iops->get_inode_acl() */
 /* #undef HAVE_GET_INODE_ACL */
 
 /* iops->get_link() cookie */
 /* #undef HAVE_GET_LINK_COOKIE */
 
 /* iops->get_link() delayed */
 /* #undef HAVE_GET_LINK_DELAYED */
 
 /* group_info->gid exists */
 /* #undef HAVE_GROUP_INFO_GID */
 
 /* has_capability() is available */
 /* #undef HAVE_HAS_CAPABILITY */
 
 /* iattr->ia_vfsuid and iattr->ia_vfsgid exist */
 /* #undef HAVE_IATTR_VFSID */
 
 /* Define if you have the iconv() function and it works. */
 #define HAVE_ICONV 1
 
 /* iops->getattr() takes struct mnt_idmap* */
 /* #undef HAVE_IDMAP_IOPS_GETATTR */
 
 /* iops->setattr() takes struct mnt_idmap* */
 /* #undef HAVE_IDMAP_IOPS_SETATTR */
 
 /* APIs for idmapped mount are present */
 /* #undef HAVE_IDMAP_MNT_API */
 
 /* mnt_idmap does not have user_namespace */
 /* #undef HAVE_IDMAP_NO_USERNS */
 
 /* Define if compiler supports -Wimplicit-fallthrough */
 /* #undef HAVE_IMPLICIT_FALLTHROUGH */
 
 /* Define if compiler supports -Winfinite-recursion */
 /* #undef HAVE_INFINITE_RECURSION */
 
 /* inode_get_atime() exists in linux/fs.h */
 /* #undef HAVE_INODE_GET_ATIME */
 
 /* inode_get_ctime() exists in linux/fs.h */
 /* #undef HAVE_INODE_GET_CTIME */
 
 /* inode_get_mtime() exists in linux/fs.h */
 /* #undef HAVE_INODE_GET_MTIME */
 
 /* yes */
 /* #undef HAVE_INODE_LOCK_SHARED */
 
 /* inode_owner_or_capable() exists */
 /* #undef HAVE_INODE_OWNER_OR_CAPABLE */
 
 /* inode_owner_or_capable() takes mnt_idmap */
 /* #undef HAVE_INODE_OWNER_OR_CAPABLE_IDMAP */
 
 /* inode_owner_or_capable() takes user_ns */
 /* #undef HAVE_INODE_OWNER_OR_CAPABLE_USERNS */
 
 /* inode_set_atime_to_ts() exists in linux/fs.h */
 /* #undef HAVE_INODE_SET_ATIME_TO_TS */
 
 /* inode_set_ctime_to_ts() exists in linux/fs.h */
 /* #undef HAVE_INODE_SET_CTIME_TO_TS */
 
 /* inode_set_flags() exists */
 /* #undef HAVE_INODE_SET_FLAGS */
 
 /* inode_set_iversion() exists */
 /* #undef HAVE_INODE_SET_IVERSION */
 
 /* inode_set_mtime_to_ts() exists in linux/fs.h */
 /* #undef HAVE_INODE_SET_MTIME_TO_TS */
 
 /* inode->i_*time's are timespec64 */
 /* #undef HAVE_INODE_TIMESPEC64_TIMES */
 
 /* timestamp_truncate() exists */
 /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */
 
 /* Define to 1 if you have the <inttypes.h> header file. */
 #define HAVE_INTTYPES_H 1
 
 /* in_compat_syscall() is available */
 /* #undef HAVE_IN_COMPAT_SYSCALL */
 
 /* iops->create() takes struct mnt_idmap* */
 /* #undef HAVE_IOPS_CREATE_IDMAP */
 
 /* iops->create() takes struct user_namespace* */
 /* #undef HAVE_IOPS_CREATE_USERNS */
 
 /* iops->mkdir() takes struct mnt_idmap* */
 /* #undef HAVE_IOPS_MKDIR_IDMAP */
 
 /* iops->mkdir() takes struct user_namespace* */
 /* #undef HAVE_IOPS_MKDIR_USERNS */
 
 /* iops->mknod() takes struct mnt_idmap* */
 /* #undef HAVE_IOPS_MKNOD_IDMAP */
 
 /* iops->mknod() takes struct user_namespace* */
 /* #undef HAVE_IOPS_MKNOD_USERNS */
 
 /* iops->permission() takes struct mnt_idmap* */
 /* #undef HAVE_IOPS_PERMISSION_IDMAP */
 
 /* iops->permission() takes struct user_namespace* */
 /* #undef HAVE_IOPS_PERMISSION_USERNS */
 
 /* iops->rename() takes struct mnt_idmap* */
 /* #undef HAVE_IOPS_RENAME_IDMAP */
 
 /* iops->rename() takes struct user_namespace* */
 /* #undef HAVE_IOPS_RENAME_USERNS */
 
 /* iops->setattr() exists */
 /* #undef HAVE_IOPS_SETATTR */
 
 /* iops->symlink() takes struct mnt_idmap* */
 /* #undef HAVE_IOPS_SYMLINK_IDMAP */
 
 /* iops->symlink() takes struct user_namespace* */
 /* #undef HAVE_IOPS_SYMLINK_USERNS */
 
 /* iov_iter_advance() is available */
 /* #undef HAVE_IOV_ITER_ADVANCE */
 
 /* iov_iter_count() is available */
 /* #undef HAVE_IOV_ITER_COUNT */
 
 /* iov_iter_fault_in_readable() is available */
 /* #undef HAVE_IOV_ITER_FAULT_IN_READABLE */
 
 /* iov_iter_revert() is available */
 /* #undef HAVE_IOV_ITER_REVERT */
 
 /* iov_iter_type() is available */
 /* #undef HAVE_IOV_ITER_TYPE */
 
 /* iov_iter types are available */
 /* #undef HAVE_IOV_ITER_TYPES */
 
 /* yes */
 /* #undef HAVE_IO_SCHEDULE_TIMEOUT */
 
 /* Define to 1 if you have the 'issetugid' function. */
 #define HAVE_ISSETUGID 1
 
 /* iter_iov() is available */
 /* #undef HAVE_ITER_IOV */
 
 /* kernel has kernel_fpu_* functions */
 /* #undef HAVE_KERNEL_FPU */
 
 /* kernel has asm/fpu/api.h */
 /* #undef HAVE_KERNEL_FPU_API_HEADER */
 
 /* kernel fpu internal */
 /* #undef HAVE_KERNEL_FPU_INTERNAL */
 
 /* kernel has asm/fpu/internal.h */
 /* #undef HAVE_KERNEL_FPU_INTERNAL_HEADER */
 
 /* uncached_acl_sentinel() exists */
 /* #undef HAVE_KERNEL_GET_ACL_HANDLE_CACHE */
 
 /* Define if compiler supports -Winfinite-recursion */
 /* #undef HAVE_KERNEL_INFINITE_RECURSION */
 
 /* kernel defines intptr_t */
 /* #undef HAVE_KERNEL_INTPTR_T */
 
 /* kernel has kernel_neon_* functions */
 /* #undef HAVE_KERNEL_NEON */
 
 /* kernel does stack verification */
 /* #undef HAVE_KERNEL_OBJTOOL */
 
 /* kernel has linux/objtool.h */
 /* #undef HAVE_KERNEL_OBJTOOL_HEADER */
 
 /* kernel_read() take loff_t pointer */
 /* #undef HAVE_KERNEL_READ_PPOS */
 
 /* strlcpy() exists */
 /* #undef HAVE_KERNEL_STRLCPY */
 
 /* strscpy() exists */
 /* #undef HAVE_KERNEL_STRSCPY */
 
 /* timer_list.function gets a timer_list */
 /* #undef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST */
 
 /* struct timer_list has a flags member */
 /* #undef HAVE_KERNEL_TIMER_LIST_FLAGS */
 
 /* timer_setup() is available */
 /* #undef HAVE_KERNEL_TIMER_SETUP */
 
 /* kernel_write() take loff_t pointer */
 /* #undef HAVE_KERNEL_WRITE_PPOS */
 
 /* kmem_cache_create_usercopy() exists */
 /* #undef HAVE_KMEM_CACHE_CREATE_USERCOPY */
 
 /* kstrtoul() exists */
 /* #undef HAVE_KSTRTOUL */
 
 /* ktime_get_coarse_real_ts64() exists */
 /* #undef HAVE_KTIME_GET_COARSE_REAL_TS64 */
 
 /* ktime_get_raw_ts64() exists */
 /* #undef HAVE_KTIME_GET_RAW_TS64 */
 
 /* kvmalloc exists */
 /* #undef HAVE_KVMALLOC */
 
 /* Define if you have [aio] */
 /* #undef HAVE_LIBAIO */
 
 /* Define if you have [blkid] */
 /* #undef HAVE_LIBBLKID */
 
 /* Define if you have [crypto] */
 #define HAVE_LIBCRYPTO 1
 
 /* Define if you have [tirpc] */
 /* #undef HAVE_LIBTIRPC */
 
 /* Define if you have [udev] */
 /* #undef HAVE_LIBUDEV */
 
 /* Define if you have [unwind] */
 /* #undef HAVE_LIBUNWIND */
 
 /* libunwind has unw_get_elf_filename */
 /* #undef HAVE_LIBUNWIND_ELF */
 
 /* Define if you have [uuid] */
 /* #undef HAVE_LIBUUID */
 
 /* linux/blk-cgroup.h exists */
 /* #undef HAVE_LINUX_BLK_CGROUP_HEADER */
 
 /* lseek_execute() is available */
 /* #undef HAVE_LSEEK_EXECUTE */
 
 /* makedev() is declared in sys/mkdev.h */
 /* #undef HAVE_MAKEDEV_IN_MKDEV */
 
 /* makedev() is declared in sys/sysmacros.h */
 /* #undef HAVE_MAKEDEV_IN_SYSMACROS */
 
 /* Noting that make_request_fn() returns blk_qc_t */
 /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */
 
 /* Noting that make_request_fn() returns void */
 /* #undef HAVE_MAKE_REQUEST_FN_RET_VOID */
 
 /* iops->mkdir() takes umode_t */
 /* #undef HAVE_MKDIR_UMODE_T */
 
 /* Define to 1 if you have the 'mlockall' function. */
 #define HAVE_MLOCKALL 1
 
+/* page_mapping() is available */
+/* #undef HAVE_MM_PAGE_MAPPING */
+
 /* page_size() is available */
 /* #undef HAVE_MM_PAGE_SIZE */
 
 /* lookup_bdev() wants mode arg */
 /* #undef HAVE_MODE_LOOKUP_BDEV */
 
 /* Define if host toolchain supports MOVBE */
 #define HAVE_MOVBE 1
 
 /* new_sync_read()/new_sync_write() are available */
 /* #undef HAVE_NEW_SYNC_READ */
 
 /* folio_wait_bit() exists */
 /* #undef HAVE_PAGEMAP_FOLIO_WAIT_BIT */
 
 /* part_to_dev() exists */
 /* #undef HAVE_PART_TO_DEV */
 
 /* iops->getattr() takes a path */
 /* #undef HAVE_PATH_IOPS_GETATTR */
 
 /* Define if host toolchain supports PCLMULQDQ */
 #define HAVE_PCLMULQDQ 1
 
 /* percpu_counter_add_batch() is defined */
 /* #undef HAVE_PERCPU_COUNTER_ADD_BATCH */
 
 /* percpu_counter_init() wants gfp_t */
 /* #undef HAVE_PERCPU_COUNTER_INIT_WITH_GFP */
 
 /* posix_acl_chmod() exists */
 /* #undef HAVE_POSIX_ACL_CHMOD */
 
 /* posix_acl_from_xattr() needs user_ns */
 /* #undef HAVE_POSIX_ACL_FROM_XATTR_USERNS */
 
 /* posix_acl_release() is available */
 /* #undef HAVE_POSIX_ACL_RELEASE */
 
 /* posix_acl_release() is GPL-only */
 /* #undef HAVE_POSIX_ACL_RELEASE_GPL_ONLY */
 
 /* posix_acl_valid() wants user namespace */
 /* #undef HAVE_POSIX_ACL_VALID_WITH_NS */
 
+/* proc_handler ctl_table arg is const */
+/* #undef HAVE_PROC_HANDLER_CTL_TABLE_CONST */
+
 /* proc_ops structure exists */
 /* #undef HAVE_PROC_OPS_STRUCT */
 
 /* iops->put_link() cookie */
 /* #undef HAVE_PUT_LINK_COOKIE */
 
 /* iops->put_link() delayed */
 /* #undef HAVE_PUT_LINK_DELAYED */
 
 /* iops->put_link() nameidata */
 /* #undef HAVE_PUT_LINK_NAMEIDATA */
 
 /* If available, contains the Python version number currently in use. */
 #define HAVE_PYTHON "3.7"
 
 /* qat is enabled and existed */
 /* #undef HAVE_QAT */
 
 /* struct reclaim_state has reclaimed */
 /* #undef HAVE_RECLAIM_STATE_RECLAIMED */
 
 /* register_shrinker is vararg */
 /* #undef HAVE_REGISTER_SHRINKER_VARARG */
 
+/* register_sysctl_sz exists */
+/* #undef HAVE_REGISTER_SYSCTL_SZ */
+
 /* register_sysctl_table exists */
 /* #undef HAVE_REGISTER_SYSCTL_TABLE */
 
 /* iops->rename2() exists */
 /* #undef HAVE_RENAME2 */
 
 /* struct inode_operations_wrapper takes .rename2() */
 /* #undef HAVE_RENAME2_OPERATIONS_WRAPPER */
 
 /* iops->rename() wants flags */
 /* #undef HAVE_RENAME_WANTS_FLAGS */
 
 /* REQ_DISCARD is defined */
 /* #undef HAVE_REQ_DISCARD */
 
 /* REQ_FLUSH is defined */
 /* #undef HAVE_REQ_FLUSH */
 
 /* REQ_OP_DISCARD is defined */
 /* #undef HAVE_REQ_OP_DISCARD */
 
 /* REQ_OP_FLUSH is defined */
 /* #undef HAVE_REQ_OP_FLUSH */
 
 /* REQ_OP_SECURE_ERASE is defined */
 /* #undef HAVE_REQ_OP_SECURE_ERASE */
 
 /* REQ_PREFLUSH is defined */
 /* #undef HAVE_REQ_PREFLUSH */
 
 /* revalidate_disk() is available */
 /* #undef HAVE_REVALIDATE_DISK */
 
 /* revalidate_disk_size() is available */
 /* #undef HAVE_REVALIDATE_DISK_SIZE */
 
 /* struct rw_semaphore has member activity */
 /* #undef HAVE_RWSEM_ACTIVITY */
 
 /* struct rw_semaphore has atomic_long_t member count */
 /* #undef HAVE_RWSEM_ATOMIC_LONG_COUNT */
 
 /* linux/sched/signal.h exists */
 /* #undef HAVE_SCHED_SIGNAL_HEADER */
 
 /* Define to 1 if you have the <security/pam_modules.h> header file. */
 #define HAVE_SECURITY_PAM_MODULES_H 1
 
 /* setattr_prepare() accepts mnt_idmap */
 /* #undef HAVE_SETATTR_PREPARE_IDMAP */
 
 /* setattr_prepare() is available, doesn't accept user_namespace */
 /* #undef HAVE_SETATTR_PREPARE_NO_USERNS */
 
 /* setattr_prepare() accepts user_namespace */
 /* #undef HAVE_SETATTR_PREPARE_USERNS */
 
 /* iops->set_acl() exists, takes 3 args */
 /* #undef HAVE_SET_ACL */
 
 /* iops->set_acl() takes 4 args, arg1 is struct mnt_idmap * */
 /* #undef HAVE_SET_ACL_IDMAP_DENTRY */
 
 /* iops->set_acl() takes 4 args */
 /* #undef HAVE_SET_ACL_USERNS */
 
 /* iops->set_acl() takes 4 args, arg2 is struct dentry * */
 /* #undef HAVE_SET_ACL_USERNS_DENTRY_ARG2 */
 
 /* set_cached_acl() is usable */
 /* #undef HAVE_SET_CACHED_ACL_USABLE */
 
 /* set_special_state() exists */
 /* #undef HAVE_SET_SPECIAL_STATE */
 
 /* shrinker_register exists */
 /* #undef HAVE_SHRINKER_REGISTER */
 
 /* struct shrink_control exists */
 /* #undef HAVE_SHRINK_CONTROL_STRUCT */
 
 /* kernel_siginfo_t exists */
 /* #undef HAVE_SIGINFO */
 
 /* signal_stop() exists */
 /* #undef HAVE_SIGNAL_STOP */
 
 /* new shrinker callback wants 2 args */
 /* #undef HAVE_SINGLE_SHRINKER_CALLBACK */
 
 /* cs->count_objects exists */
 /* #undef HAVE_SPLIT_SHRINKER_CALLBACK */
 
 #if defined(__amd64__) || defined(__i386__)
 /* Define if host toolchain supports SSE */
 #define HAVE_SSE 1
 
 /* Define if host toolchain supports SSE2 */
 #define HAVE_SSE2 1
 
 /* Define if host toolchain supports SSE3 */
 #define HAVE_SSE3 1
 
 /* Define if host toolchain supports SSE4.1 */
 #define HAVE_SSE4_1 1
 
 /* Define if host toolchain supports SSE4.2 */
 #define HAVE_SSE4_2 1
 
 /* Define if host toolchain supports SSSE3 */
 #define HAVE_SSSE3 1
 #endif
 
 /* STACK_FRAME_NON_STANDARD is defined */
 /* #undef HAVE_STACK_FRAME_NON_STANDARD */
 
 /* standalone <linux/stdarg.h> exists */
 /* #undef HAVE_STANDALONE_LINUX_STDARG */
 
 /* Define to 1 if you have the <stdint.h> header file. */
 #define HAVE_STDINT_H 1
 
 /* Define to 1 if you have the <stdio.h> header file. */
 #define HAVE_STDIO_H 1
 
 /* Define to 1 if you have the <stdlib.h> header file. */
 #define HAVE_STDLIB_H 1
 
 /* Define to 1 if you have the <strings.h> header file. */
 #define HAVE_STRINGS_H 1
 
 /* Define to 1 if you have the <string.h> header file. */
 #define HAVE_STRING_H 1
 
 /* Define to 1 if you have the 'strlcat' function. */
 #define HAVE_STRLCAT 1
 
 /* Define to 1 if you have the 'strlcpy' function. */
 #define HAVE_STRLCPY 1
 
 /* submit_bio is member of struct block_device_operations */
 /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
 
 /* have super_block s_shrink */
 /* #undef HAVE_SUPER_BLOCK_S_SHRINK */
 
 /* have super_block s_shrink pointer */
 /* #undef HAVE_SUPER_BLOCK_S_SHRINK_PTR */
 
 /* super_setup_bdi_name() exits */
 /* #undef HAVE_SUPER_SETUP_BDI_NAME */
 
 /* super_block->s_user_ns exists */
 /* #undef HAVE_SUPER_USER_NS */
 
 /* sync_blockdev() is declared in include/blkdev.h */
 /* #undef HAVE_SYNC_BLOCKDEV */
 
 /* struct kobj_type has default_groups */
 /* #undef HAVE_SYSFS_DEFAULT_GROUPS */
 
 /* Define to 1 if you have the <sys/stat.h> header file. */
 #define HAVE_SYS_STAT_H 1
 
 /* Define to 1 if you have the <sys/types.h> header file. */
 #define HAVE_SYS_TYPES_H 1
 
 /* i_op->tmpfile() exists */
 /* #undef HAVE_TMPFILE */
 
 /* i_op->tmpfile() uses old dentry signature */
 /* #undef HAVE_TMPFILE_DENTRY */
 
 /* i_op->tmpfile() has mnt_idmap */
 /* #undef HAVE_TMPFILE_IDMAP */
 
 /* i_op->tmpfile() has userns */
 /* #undef HAVE_TMPFILE_USERNS */
 
 /* totalhigh_pages() exists */
 /* #undef HAVE_TOTALHIGH_PAGES */
 
 /* kernel has totalram_pages() */
 /* #undef HAVE_TOTALRAM_PAGES_FUNC */
 
 /* Define to 1 if you have the 'udev_device_get_is_initialized' function. */
 /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */
 
 /* kernel has __kernel_fpu_* functions */
 /* #undef HAVE_UNDERSCORE_KERNEL_FPU */
 
 /* Define to 1 if you have the <unistd.h> header file. */
 #define HAVE_UNISTD_H 1
 
 /* iops->getattr() takes struct user_namespace* */
 /* #undef HAVE_USERNS_IOPS_GETATTR */
 
 /* iops->setattr() takes struct user_namespace* */
 /* #undef HAVE_USERNS_IOPS_SETATTR */
 
 /* user_namespace->ns.inum exists */
 /* #undef HAVE_USER_NS_COMMON_INUM */
 
 /* iops->getattr() takes a vfsmount */
 /* #undef HAVE_VFSMOUNT_IOPS_GETATTR */
 
 /* fops->clone_file_range() is available */
 /* #undef HAVE_VFS_CLONE_FILE_RANGE */
 
 /* fops->copy_file_range() is available */
 /* #undef HAVE_VFS_COPY_FILE_RANGE */
 
 /* fops->dedupe_file_range() is available */
 /* #undef HAVE_VFS_DEDUPE_FILE_RANGE */
 
 /* aops->direct_IO() uses iovec */
 /* #undef HAVE_VFS_DIRECT_IO_IOVEC */
 
 /* aops->direct_IO() uses iov_iter without rw */
 /* #undef HAVE_VFS_DIRECT_IO_ITER */
 
 /* aops->direct_IO() uses iov_iter with offset */
 /* #undef HAVE_VFS_DIRECT_IO_ITER_OFFSET */
 
 /* aops->direct_IO() uses iov_iter with rw and offset */
 /* #undef HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET */
 
 /* filemap_dirty_folio exists */
 /* #undef HAVE_VFS_FILEMAP_DIRTY_FOLIO */
 
 /* file_operations_extend takes .copy_file_range() and .clone_file_range() */
 /* #undef HAVE_VFS_FILE_OPERATIONS_EXTEND */
 
 /* generic_copy_file_range() is available */
 /* #undef HAVE_VFS_GENERIC_COPY_FILE_RANGE */
 
 /* All required iov_iter interfaces are available */
 /* #undef HAVE_VFS_IOV_ITER */
 
 /* fops->iterate() is available */
 /* #undef HAVE_VFS_ITERATE */
 
 /* fops->iterate_shared() is available */
 /* #undef HAVE_VFS_ITERATE_SHARED */
 
 /* fops->readdir() is available */
 /* #undef HAVE_VFS_READDIR */
 
 /* address_space_operations->readpages exists */
 /* #undef HAVE_VFS_READPAGES */
 
 /* read_folio exists */
 /* #undef HAVE_VFS_READ_FOLIO */
 
 /* fops->remap_file_range() is available */
 /* #undef HAVE_VFS_REMAP_FILE_RANGE */
 
 /* fops->read/write_iter() are available */
 /* #undef HAVE_VFS_RW_ITERATE */
 
 /* __set_page_dirty_nobuffers exists */
 /* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */
 
 /* splice_copy_file_range() is available */
 /* #undef HAVE_VFS_SPLICE_COPY_FILE_RANGE */
 
 /* __vmalloc page flags exists */
 /* #undef HAVE_VMALLOC_PAGE_KERNEL */
 
 /* yes */
 /* #undef HAVE_WAIT_ON_BIT_ACTION */
 
 /* wait_queue_entry_t exists */
 /* #undef HAVE_WAIT_QUEUE_ENTRY_T */
 
 /* wq_head->head and wq_entry->entry exist */
 /* #undef HAVE_WAIT_QUEUE_HEAD_ENTRY */
 
 /* int (*writepage_t)() takes struct folio* */
 /* #undef HAVE_WRITEPAGE_T_FOLIO */
 
 /* xattr_handler->get() wants dentry */
 /* #undef HAVE_XATTR_GET_DENTRY */
 
 /* xattr_handler->get() wants both dentry and inode */
 /* #undef HAVE_XATTR_GET_DENTRY_INODE */
 
 /* xattr_handler->get() wants dentry and inode and flags */
 /* #undef HAVE_XATTR_GET_DENTRY_INODE_FLAGS */
 
 /* xattr_handler->get() wants xattr_handler */
 /* #undef HAVE_XATTR_GET_HANDLER */
 
 /* xattr_handler has name */
 /* #undef HAVE_XATTR_HANDLER_NAME */
 
 /* xattr_handler->list() wants dentry */
 /* #undef HAVE_XATTR_LIST_DENTRY */
 
 /* xattr_handler->list() wants xattr_handler */
 /* #undef HAVE_XATTR_LIST_HANDLER */
 
 /* xattr_handler->list() wants simple */
 /* #undef HAVE_XATTR_LIST_SIMPLE */
 
 /* xattr_handler->set() wants dentry */
 /* #undef HAVE_XATTR_SET_DENTRY */
 
 /* xattr_handler->set() wants both dentry and inode */
 /* #undef HAVE_XATTR_SET_DENTRY_INODE */
 
 /* xattr_handler->set() wants xattr_handler */
 /* #undef HAVE_XATTR_SET_HANDLER */
 
 /* xattr_handler->set() takes mnt_idmap */
 /* #undef HAVE_XATTR_SET_IDMAP */
 
 /* xattr_handler->set() takes user_namespace */
 /* #undef HAVE_XATTR_SET_USERNS */
 
 /* Define if host toolchain supports XSAVE */
 #define HAVE_XSAVE 1
 
 /* Define if host toolchain supports XSAVEOPT */
 #define HAVE_XSAVEOPT 1
 
 /* Define if host toolchain supports XSAVES */
 #define HAVE_XSAVES 1
 
 /* ZERO_PAGE() is GPL-only */
 /* #undef HAVE_ZERO_PAGE_GPL_ONLY */
 
 /* Define if you have [z] */
 #define HAVE_ZLIB 1
 
 /* __posix_acl_chmod() exists */
 /* #undef HAVE___POSIX_ACL_CHMOD */
 
 /* kernel exports FPU functions */
 /* #undef KERNEL_EXPORTS_X86_FPU */
 
 /* TBD: fetch(3) support */
 #if 0
 /* whether the chosen libfetch is to be loaded at run-time */
 #define LIBFETCH_DYNAMIC 1
 
 /* libfetch is fetch(3) */
 #define LIBFETCH_IS_FETCH 1
 
 /* libfetch is libcurl */
 #define LIBFETCH_IS_LIBCURL 0
 
 /* soname of chosen libfetch */
 #define LIBFETCH_SONAME "libfetch.so.6"
 #endif
 
 /* Define to the sub-directory where libtool stores uninstalled libraries. */
 #define LT_OBJDIR ".libs/"
 
 /* make_request_fn() return type */
 /* #undef MAKE_REQUEST_FN_RET */
 
 /* struct shrink_control has nid */
 /* #undef SHRINK_CONTROL_HAS_NID */
 
 /* using complete_and_exit() instead */
 /* #undef SPL_KTHREAD_COMPLETE_AND_EXIT */
 
 /* Defined for legacy compatibility. */
 #define SPL_META_ALIAS ZFS_META_ALIAS
 
 /* Defined for legacy compatibility. */
 #define SPL_META_RELEASE ZFS_META_RELEASE
 
 /* Defined for legacy compatibility. */
 #define SPL_META_VERSION ZFS_META_VERSION
 
 /* pde_data() is PDE_DATA() */
 /* #undef SPL_PDE_DATA */
 
 /* Define to 1 if all of the C89 standard headers exist (not just the ones
    required in a freestanding environment). This macro is provided for
    backward compatibility; new code need not use it. */
 #define SYSTEM_FREEBSD 1
 
 /* True if ZFS is to be compiled for a Linux system */
 /* #undef SYSTEM_LINUX */
 
 /* Version number of package */
 /* #undef ZFS_DEBUG */
 
 /* /dev/zfs minor */
 /* #undef ZFS_DEVICE_MINOR */
 
 /* enum node_stat_item contains NR_FILE_PAGES */
 /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES */
 
 /* enum node_stat_item contains NR_INACTIVE_ANON */
 /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON */
 
 /* enum node_stat_item contains NR_INACTIVE_FILE */
 /* #undef ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE */
 
 /* enum zone_stat_item contains NR_FILE_PAGES */
 /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_FILE_PAGES */
 
 /* enum zone_stat_item contains NR_INACTIVE_ANON */
 /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_ANON */
 
 /* enum zone_stat_item contains NR_INACTIVE_FILE */
 /* #undef ZFS_ENUM_ZONE_STAT_ITEM_NR_INACTIVE_FILE */
 
 /* GENHD_FL_EXT_DEVT flag is not available */
 /* #undef ZFS_GENHD_FL_EXT_DEVT */
 
 /* GENHD_FL_NO_PART_SCAN flag is available */
 /* #undef ZFS_GENHD_FL_NO_PART */
 
 /* global_node_page_state() exists */
 /* #undef ZFS_GLOBAL_NODE_PAGE_STATE */
 
 /* global_zone_page_state() exists */
 /* #undef ZFS_GLOBAL_ZONE_PAGE_STATE */
 
 /* Define to 1 if GPL-only symbols can be used */
 /* #undef ZFS_IS_GPL_COMPATIBLE */
 
 /* Define the project alias string. */
-#define ZFS_META_ALIAS "zfs-2.2.5-FreeBSD_g33174af15"
+#define ZFS_META_ALIAS "zfs-2.2.6-FreeBSD_gbaa503145"
 
 /* Define the project author. */
 #define ZFS_META_AUTHOR "OpenZFS"
 
 /* Define the project release date. */
 /* #undef ZFS_META_DATA */
 
 /* Define the maximum compatible kernel version. */
-#define ZFS_META_KVER_MAX "6.9"
+#define ZFS_META_KVER_MAX "6.10"
 
 /* Define the minimum compatible kernel version. */
 #define ZFS_META_KVER_MIN "3.10"
 
 /* Define the project license. */
 #define ZFS_META_LICENSE "CDDL"
 
 /* Define the libtool library 'age' version information. */
 /* #undef ZFS_META_LT_AGE */
 
 /* Define the libtool library 'current' version information. */
 /* #undef ZFS_META_LT_CURRENT */
 
 /* Define the libtool library 'revision' version information. */
 /* #undef ZFS_META_LT_REVISION */
 
 /* Define the project name. */
 #define ZFS_META_NAME "zfs"
 
 /* Define the project release. */
-#define ZFS_META_RELEASE "FreeBSD_g33174af15"
+#define ZFS_META_RELEASE "FreeBSD_gbaa503145"
 
 /* Define the project version. */
-#define ZFS_META_VERSION "2.2.5"
+#define ZFS_META_VERSION "2.2.6"
 
 /* count is located in percpu_ref.data */
 /* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */
diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h
index dd06a2d01985..ba3f91705173 100644
--- a/sys/modules/zfs/zfs_gitrev.h
+++ b/sys/modules/zfs/zfs_gitrev.h
@@ -1 +1 @@
-#define	ZFS_META_GITREV "zfs-2.2.5-0-g33174af15"
+#define	ZFS_META_GITREV "zfs-2.2.6-0-gbaa503145"