diff --git a/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py b/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py
index b49255e8381d..08021aabcb61 100755
--- a/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py
+++ b/sys/contrib/openzfs/.github/workflows/scripts/generate-ci-type.py
@@ -1,108 +1,108 @@
 #!/usr/bin/env python3
 """
 Determine the CI type based on the change list and commit message.
 
 Prints "quick" if (explicity required by user):
 - the *last* commit message contains 'ZFS-CI-Type: quick'
 or if (heuristics):
 - the files changed are not in the list of specified directories, and
 - all commit messages do not contain 'ZFS-CI-Type: full'
 
 Otherwise prints "full".
 """
 
 import sys
 import subprocess
 import re
 
 """
 Patterns of files that are not considered to trigger full CI.
 Note: not using pathlib.Path.match() because it does not support '**'
 """
 FULL_RUN_IGNORE_REGEX = list(map(re.compile, [
     r'.*\.md',
     r'.*\.gitignore'
 ]))
 
 """
 Patterns of files that are considered to trigger full CI.
 """
 FULL_RUN_REGEX = list(map(re.compile, [
     r'\.github/workflows/scripts/.*',
     r'cmd.*',
     r'configs/.*',
     r'META',
     r'.*\.am',
     r'.*\.m4',
     r'autogen\.sh',
     r'configure\.ac',
     r'copy-builtin',
     r'contrib',
     r'etc',
     r'include',
     r'lib/.*',
     r'module/.*',
     r'scripts/.*',
     r'tests/.*',
     r'udev/.*'
 ]))
 
 if __name__ == '__main__':
 
     prog = sys.argv[0]
 
     if len(sys.argv) != 3:
         print(f'Usage: {prog} ')
         sys.exit(1)
 
     head, base = sys.argv[1:3]
 
     def output_type(type, reason):
         print(f'{prog}: will run {type} CI: {reason}', file=sys.stderr)
         print(type)
         sys.exit(0)
 
     # check last (HEAD) commit message
     last_commit_message_raw = subprocess.run([
-        'git', 'show', '-s', '--format=%B', 'HEAD'
+        'git', 'show', '-s', '--format=%B', head
     ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
     for line in last_commit_message_raw.stdout.decode().splitlines():
         if line.strip().lower() == 'zfs-ci-type: quick':
             output_type('quick', f'explicitly requested by HEAD commit {head}')
 
     # check all commit messages
     all_commit_message_raw = subprocess.run([
         'git', 'show', '-s',
         '--format=ZFS-CI-Commit: %H%n%B', f'{head}...{base}'
     ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     all_commit_message = all_commit_message_raw.stdout.decode().splitlines()
 
     commit_ref = head
     for line in all_commit_message:
         if line.startswith('ZFS-CI-Commit:'):
             commit_ref = line.lstrip('ZFS-CI-Commit:').rstrip()
         if line.strip().lower() == 'zfs-ci-type: full':
             output_type('full', f'explicitly requested by commit {commit_ref}')
 
     # check changed files
     changed_files_raw = subprocess.run([
         'git', 'diff', '--name-only', head, base
     ], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     changed_files = changed_files_raw.stdout.decode().splitlines()
 
     for f in changed_files:
         for r in FULL_RUN_IGNORE_REGEX:
             if r.match(f):
                 break
         else:
             for r in FULL_RUN_REGEX:
                 if r.match(f):
                     output_type(
                         'full',
                         f'changed file "{f}" matches pattern "{r.pattern}"'
                     )
 
     # catch-all
     output_type('quick', 'no changed file matches full CI patterns')
diff --git a/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml b/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml
index 4ebb80af1f03..a5dbfc099c90 100644
--- a/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml
+++ b/sys/contrib/openzfs/.github/workflows/zfs-qemu.yml
@@ -1,175 +1,174 @@
 name: zfs-qemu
 
 on:
   push:
   pull_request:
   workflow_dispatch:
     inputs:
       fedora_kernel_ver:
         type: string
         required: false
         default: ""
         description: "(optional) Experimental kernel version to install on Fedora (like '6.14' or '6.13.3-0.rc3')"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
   test-config:
     name: Setup
     runs-on: ubuntu-24.04
     outputs:
       test_os: ${{ steps.os.outputs.os }}
       ci_type: ${{ steps.os.outputs.ci_type }}
     steps:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 0
       - name: Generate OS config and CI type
         id: os
         run: |
           FULL_OS='["almalinux8", "almalinux9", "almalinux10", "centos-stream9", "centos-stream10", "debian12", "debian13", "fedora41", "fedora42", "freebsd13-5r", "freebsd14-3s", "freebsd15-0c", "ubuntu22", "ubuntu24"]'
           QUICK_OS='["almalinux8", "almalinux9", "almalinux10", "debian12", "fedora42", "freebsd14-3s", "ubuntu24"]'
           # determine CI type when running on PR
           ci_type="full"
           if ${{ github.event_name == 'pull_request' }}; then
             head=${{ github.event.pull_request.head.sha }}
             base=${{ github.event.pull_request.base.sha }}
             ci_type=$(python3 .github/workflows/scripts/generate-ci-type.py $head $base)
           fi
           if [ "$ci_type" == "quick" ]; then
             os_selection="$QUICK_OS"
           else
             os_selection="$FULL_OS"
           fi
-          if [ ${{ github.event.inputs.fedora_kernel_ver }} != "" ] ; then
+          if ${{ github.event.inputs.fedora_kernel_ver != '' }}; then
             # They specified a custom kernel version for Fedora. Use only
             # Fedora runners.
             os_json=$(echo ${os_selection} | jq -c '[.[] | select(startswith("fedora"))]')
           else
             # Normal case
             os_json=$(echo ${os_selection} | jq -c)
           fi
-          echo $os_json
-          echo "os=$os_json" >> $GITHUB_OUTPUT
-          echo "ci_type=$ci_type" >> $GITHUB_OUTPUT
+          echo "os=$os_json" | tee -a $GITHUB_OUTPUT
+          echo "ci_type=$ci_type" | tee -a $GITHUB_OUTPUT
 
   qemu-vm:
     name: qemu-x86
     needs: [ test-config ]
     strategy:
       fail-fast: false
       matrix:
         # rhl: almalinux8, almalinux9, centos-stream9, fedora4x
         # debian: debian12, debian13, ubuntu22, ubuntu24
         # misc: archlinux, tumbleweed
         # FreeBSD variants of 2025-06:
         # FreeBSD Release: freebsd13-5r, freebsd14-2r, freebsd14-3r
         # FreeBSD Stable: freebsd13-5s, freebsd14-3s
         # FreeBSD Current: freebsd15-0c
         os: ${{ fromJson(needs.test-config.outputs.test_os) }}
     runs-on: ubuntu-24.04
     steps:
       - uses: actions/checkout@v4
         with:
           ref: ${{ github.event.pull_request.head.sha }}
       - name: Setup QEMU
-        timeout-minutes: 10
+        timeout-minutes: 15
         run: .github/workflows/scripts/qemu-1-setup.sh
       - name: Start build machine
         timeout-minutes: 10
         run: .github/workflows/scripts/qemu-2-start.sh ${{ matrix.os }}
       - name: Install dependencies
         timeout-minutes: 20
         run: .github/workflows/scripts/qemu-3-deps.sh ${{ matrix.os }} ${{ github.event.inputs.fedora_kernel_ver }}
       - name: Build modules
         timeout-minutes: 30
         run: .github/workflows/scripts/qemu-4-build.sh --poweroff --enable-debug ${{ matrix.os }}
       - name: Setup testing machines
         timeout-minutes: 5
         run: .github/workflows/scripts/qemu-5-setup.sh
       - name: Run tests
         timeout-minutes: 270
         run: .github/workflows/scripts/qemu-6-tests.sh
         env:
           CI_TYPE: ${{ needs.test-config.outputs.ci_type }}
       - name: Prepare artifacts
         if: always()
         timeout-minutes: 10
         run: .github/workflows/scripts/qemu-7-prepare.sh
       - uses: actions/upload-artifact@v4
         id: artifact-upload
         if: always()
         with:
           name: Logs-functional-${{ matrix.os }}
           path: /tmp/qemu-${{ matrix.os }}.tar
           if-no-files-found: ignore
       - name: Test Summary
         if: always()
         run: .github/workflows/scripts/qemu-8-summary.sh '${{ steps.artifact-upload.outputs.artifact-url }}'
 
   cleanup:
     if: always()
     name: Cleanup
     runs-on: ubuntu-latest
     needs: [ qemu-vm ]
     steps:
       - uses: actions/checkout@v4
         with:
           ref: ${{ github.event.pull_request.head.sha }}
       - uses: actions/download-artifact@v4
       - name: Generating summary
        run: .github/workflows/scripts/qemu-9-summary-page.sh
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 2
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 3
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 4
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 5
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 6
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 7
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 8
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 9
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 10
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 11
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 12
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 13
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 14
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 15
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 16
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 17
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 18
      - name: Generating summary...
        run: .github/workflows/scripts/qemu-9-summary-page.sh 19
      - uses: actions/upload-artifact@v4
        with:
          name: Summary Files
          path: out-*
diff --git a/sys/contrib/openzfs/META b/sys/contrib/openzfs/META
index 42f65290e4e3..5704b5c6de8a 100644
--- a/sys/contrib/openzfs/META
+++ b/sys/contrib/openzfs/META
@@ -1,10 +1,10 @@
 Meta: 1
 Name: zfs
 Branch: 1.0
-Version: 2.4.0
-Release: rc1
+Version: 2.4.99
+Release: 1
 Release-Tags: relext
 License: CDDL
 Author: OpenZFS
 Linux-Maximum: 6.16
 Linux-Minimum: 4.18
diff --git a/sys/contrib/openzfs/Makefile.am b/sys/contrib/openzfs/Makefile.am
index 5f09d170e730..30f78e490b78 100644
--- a/sys/contrib/openzfs/Makefile.am
+++ b/sys/contrib/openzfs/Makefile.am
@@ -1,211 +1,215 @@
 CLEANFILES =
 dist_noinst_DATA =
 INSTALL_DATA_HOOKS =
+INSTALL_EXEC_HOOKS =
 ALL_LOCAL =
 CLEAN_LOCAL =
 CHECKS = shellcheck checkbashisms
 include $(top_srcdir)/config/Rules.am
 include $(top_srcdir)/config/CppCheck.am
 include $(top_srcdir)/config/Shellcheck.am
 include $(top_srcdir)/config/Substfiles.am
 include $(top_srcdir)/scripts/Makefile.am
 ACLOCAL_AMFLAGS = -I config
 SUBDIRS = include
 if BUILD_LINUX
 include $(srcdir)/%D%/rpm/Makefile.am
 endif
 if CONFIG_USER
 include $(srcdir)/%D%/cmd/Makefile.am
 include $(srcdir)/%D%/contrib/Makefile.am
 include $(srcdir)/%D%/etc/Makefile.am
 include $(srcdir)/%D%/lib/Makefile.am
 include $(srcdir)/%D%/man/Makefile.am
 include $(srcdir)/%D%/tests/Makefile.am
 if BUILD_LINUX
 include $(srcdir)/%D%/udev/Makefile.am
 endif
 endif
 CPPCHECKDIRS += module
 if CONFIG_KERNEL
 SUBDIRS += module
 extradir = $(prefix)/src/zfs-$(VERSION)
 extra_HEADERS = zfs.release.in zfs_config.h.in
 endif
 dist_noinst_DATA += autogen.sh copy-builtin
 dist_noinst_DATA += AUTHORS CODE_OF_CONDUCT.md COPYRIGHT LICENSE META NEWS NOTICE
 dist_noinst_DATA += README.md RELEASES.md
 dist_noinst_DATA += module/lua/README.zfs module/os/linux/spl/README.md
 # Include all the extra licensing information for modules
 dist_noinst_DATA += module/icp/algs/skein/THIRDPARTYLICENSE
 dist_noinst_DATA
+= module/icp/algs/skein/THIRDPARTYLICENSE.descrip dist_noinst_DATA += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman dist_noinst_DATA += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip dist_noinst_DATA += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl dist_noinst_DATA += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl dist_noinst_DATA += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2 dist_noinst_DATA += module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.cityhash dist_noinst_DATA += module/zfs/THIRDPARTYLICENSE.cityhash.descrip @CODE_COVERAGE_RULES@ GITREV = include/zfs_gitrev.h CLEANFILES += $(GITREV) PHONY += gitrev gitrev: $(AM_V_GEN)$(top_srcdir)/scripts/make_gitrev.sh $(GITREV) all: gitrev PHONY += install-data-hook $(INSTALL_DATA_HOOKS) install-data-hook: $(INSTALL_DATA_HOOKS) +PHONY += install-exec-hook $(INSTALL_EXEC_HOOKS) +install-exec-hook: $(INSTALL_EXEC_HOOKS) + PHONY += maintainer-clean-local maintainer-clean-local: -$(RM) $(GITREV) PHONY += distclean-local distclean-local: -$(RM) -R autom4te*.cache build -find . \( -name SCCS -o -name BitKeeper -o -name .svn -o -name CVS \ -o -name .pc -o -name .hg -o -name .git \) -prune -o \ \( -name '*.orig' -o -name '*.rej' -o -name '*~' \ -o -name '*.bak' -o -name '#*#' -o -name '.*.orig' \ -o -name '.*.rej' -o -size 0 -o -name '*%' -o -name '.*.cmd' \ -o -name 'core' -o -name 'Makefile' -o -name 'Module.symvers' \ -o -name '*.order' -o -name '*.markers' -o -name '*.gcda' \ -o -name '*.gcno' \) \ -type f -delete PHONY += $(CLEAN_LOCAL) clean-local: $(CLEAN_LOCAL) PHONY += $(ALL_LOCAL) all-local: $(ALL_LOCAL) dist-hook: $(top_srcdir)/scripts/make_gitrev.sh -D $(distdir) $(GITREV) $(SED) $(ac_inplace) 's/\(Release:[[:space:]]*\).*/\1$(RELEASE)/' $(distdir)/META PHONY += codecheck $(CHECKS) codecheck: $(CHECKS) SHELLCHECKSCRIPTS += autogen.sh PHONY += checkstyle checkstyle: codecheck commitcheck PHONY += commitcheck commitcheck: $(AM_V_at)if git rev-parse --git-dir > /dev/null 2>&1; then \ ${top_srcdir}/scripts/commitcheck.sh; \ fi CHECKS += spdxcheck spdxcheck: $(AM_V_at)$(top_srcdir)/scripts/spdxcheck.pl if HAVE_PARALLEL cstyle_line = -print0 | parallel -X0 ${top_srcdir}/scripts/cstyle.pl -cpP {} else cstyle_line = -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} + endif CHECKS += cstyle cstyle: $(AM_V_at)find $(top_srcdir) -name build -prune \ -o -type f -name '*.[hc]' \ ! -name 'zfs_config.*' ! -name '*.mod.c' \ ! -name 'opt_global.h' ! -name '*_if*.h' \ ! -name 'zstd_compat_wrapper.h' \ ! -path './module/zstd/lib/*' \ ! -path './include/sys/lua/*' \ ! -path './module/lua/l*.[ch]' \ ! -path './module/zfs/lz4.c' \ $(cstyle_line) filter_executable = -exec test -x '{}' \; -print CHECKS += testscheck testscheck: $(AM_V_at)[ $$(find $(top_srcdir)/tests/zfs-tests -type f \ \( -name '*.ksh' -not $(filter_executable) \) -o \ \( -name '*.kshlib' $(filter_executable) \) -o \ \( -name '*.shlib' $(filter_executable) \) -o \ \( -name '*.cfg' $(filter_executable) \) | \ tee /dev/stderr | wc -l) -eq 0 ] CHECKS += vcscheck vcscheck: $(AM_V_at)if git rev-parse --git-dir > /dev/null 2>&1; then \ git ls-files . 
--exclude-standard --others | \ awk '{c++; print} END {if(c>0) exit 1}' ; \ fi CHECKS += zstdcheck zstdcheck: @$(MAKE) -C module check-zstd-symbols PHONY += lint lint: cppcheck paxcheck PHONY += paxcheck paxcheck: $(AM_V_at)if type scanelf > /dev/null 2>&1; then \ $(top_srcdir)/scripts/paxcheck.sh $(top_builddir); \ else \ echo "skipping paxcheck because scanelf is not installed"; \ fi CHECKS += flake8 flake8: $(AM_V_at)if type flake8 > /dev/null 2>&1; then \ flake8 $(top_srcdir); \ else \ echo "skipping flake8 because flake8 is not installed"; \ fi PHONY += regen-tests regen-tests: @$(MAKE) -C tests/zfs-tests/tests regen PHONY += ctags ctags: $(RM) tags find $(top_srcdir) -name '.?*' -prune \ -o -type f -name '*.[hcS]' -exec ctags -a {} + PHONY += etags etags: $(RM) TAGS find $(top_srcdir) -name '.?*' -prune \ -o -type f -name '*.[hcS]' -exec etags -a {} + PHONY += cscopelist cscopelist: find $(top_srcdir) -name '.?*' -prune \ -o -type f -name '*.[hc]' -print >cscope.files PHONY += tags tags: ctags etags PHONY += pkg pkg-dkms pkg-kmod pkg-utils pkg: @DEFAULT_PACKAGE@ pkg-dkms: @DEFAULT_PACKAGE@-dkms pkg-kmod: @DEFAULT_PACKAGE@-kmod pkg-utils: @DEFAULT_PACKAGE@-utils include config/rpm.am include config/deb.am include config/tgz.am .PHONY: $(PHONY) diff --git a/sys/contrib/openzfs/cmd/Makefile.am b/sys/contrib/openzfs/cmd/Makefile.am index 96040976e53e..e79bfae2b10f 100644 --- a/sys/contrib/openzfs/cmd/Makefile.am +++ b/sys/contrib/openzfs/cmd/Makefile.am @@ -1,114 +1,118 @@ bin_SCRIPTS = bin_PROGRAMS = sbin_SCRIPTS = sbin_PROGRAMS = dist_bin_SCRIPTS = zfsexec_PROGRAMS = mounthelper_PROGRAMS = sbin_SCRIPTS += fsck.zfs SHELLCHECKSCRIPTS += fsck.zfs CLEANFILES += fsck.zfs dist_noinst_DATA += %D%/fsck.zfs.in $(call SUBST,fsck.zfs,%D%/) sbin_PROGRAMS += zfs_ids_to_path CPPCHECKTARGETS += zfs_ids_to_path zfs_ids_to_path_SOURCES = \ %D%/zfs_ids_to_path.c zfs_ids_to_path_LDADD = \ libzfs.la zhack_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) sbin_PROGRAMS += zhack CPPCHECKTARGETS += zhack zhack_SOURCES = \ %D%/zhack.c zhack_LDADD = \ libzpool.la \ libzfs_core.la \ libnvpair.la ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) ztest_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) sbin_PROGRAMS += ztest CPPCHECKTARGETS += ztest ztest_SOURCES = \ %D%/ztest.c ztest_LDADD = \ libzpool.la \ libzfs_core.la \ libnvpair.la ztest_LDADD += -lm ztest_LDFLAGS = -pthread include $(srcdir)/%D%/raidz_test/Makefile.am include $(srcdir)/%D%/zdb/Makefile.am include $(srcdir)/%D%/zfs/Makefile.am include $(srcdir)/%D%/zinject/Makefile.am include $(srcdir)/%D%/zpool/Makefile.am include $(srcdir)/%D%/zpool_influxdb/Makefile.am include $(srcdir)/%D%/zstream/Makefile.am if BUILD_LINUX mounthelper_PROGRAMS += mount.zfs CPPCHECKTARGETS += mount.zfs mount_zfs_SOURCES = \ %D%/mount_zfs.c mount_zfs_LDADD = \ libzfs.la \ libzfs_core.la \ libnvpair.la mount_zfs_LDADD += $(LTLIBINTL) CPPCHECKTARGETS += raidz_test sbin_PROGRAMS += zgenhostid CPPCHECKTARGETS += zgenhostid zgenhostid_SOURCES = \ %D%/zgenhostid.c dist_bin_SCRIPTS += %D%/zvol_wait SHELLCHECKSCRIPTS += %D%/zvol_wait include $(srcdir)/%D%/zed/Makefile.am endif if USING_PYTHON bin_SCRIPTS += arc_summary arcstat dbufstat zilstat CLEANFILES += arc_summary arcstat dbufstat zilstat dist_noinst_DATA += %D%/arc_summary %D%/arcstat.in %D%/dbufstat.in %D%/zilstat.in $(call SUBST,arcstat,%D%/) $(call SUBST,dbufstat,%D%/) $(call SUBST,zilstat,%D%/) arc_summary: %D%/arc_summary $(AM_V_at)cp $< $@ -endif +cmd-rename-install-exec-hook: + $(LN_S) -f arcstat $(DESTDIR)$(bindir)/zarcstat + 
$(LN_S) -f arc_summary $(DESTDIR)$(bindir)/zarcsummary +INSTALL_EXEC_HOOKS += cmd-rename-install-exec-hook +endif PHONY += cmd cmd: $(bin_SCRIPTS) $(bin_PROGRAMS) $(sbin_SCRIPTS) $(sbin_PROGRAMS) $(dist_bin_SCRIPTS) $(zfsexec_PROGRAMS) $(mounthelper_PROGRAMS) diff --git a/sys/contrib/openzfs/cmd/arc_summary b/sys/contrib/openzfs/cmd/arc_summary index e60c6b64e8a1..9538dd599cb7 100755 --- a/sys/contrib/openzfs/cmd/arc_summary +++ b/sys/contrib/openzfs/cmd/arc_summary @@ -1,1073 +1,1080 @@ #!/usr/bin/env python3 # SPDX-License-Identifier: BSD-2-Clause # # Copyright (c) 2008 Ben Rockwood , # Copyright (c) 2010 Martin Matuska , # Copyright (c) 2010-2011 Jason J. Hellenthal , # Copyright (c) 2017 Scot W. Stevenson # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. """Print statistics on the ZFS ARC Cache and other information Provides basic information on the ARC, its efficiency, the L2ARC (if present), the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the in-source documentation and code at https://github.com/openzfs/zfs/blob/master/module/zfs/arc.c for details. The original introduction to arc_summary can be found at http://cuddletech.com/?p=454 """ import argparse import os import subprocess import sys import time import errno # We can't use env -S portably, and we need python3 -u to handle pipes in # the shell abruptly closing the way we want to, so... 
import io if isinstance(sys.__stderr__.buffer, io.BufferedWriter): os.execv(sys.executable, [sys.executable, "-u"] + sys.argv) DESCRIPTION = 'Print ARC and other statistics for OpenZFS' INDENT = ' '*8 LINE_LENGTH = 72 DATE_FORMAT = '%a %b %d %H:%M:%S %Y' TITLE = 'ZFS Subsystem Report' SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split() SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' # Tunables and SPL are handled separately because they come from # different sources SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats 'zfetch': 'zfetchstats', 'zil': 'zil'} parser = argparse.ArgumentParser(description=DESCRIPTION) parser.add_argument('-a', '--alternate', action='store_true', default=False, help='use alternate formatting for tunables and SPL', dest='alt') parser.add_argument('-d', '--description', action='store_true', default=False, help='print descriptions with tunables and SPL', dest='desc') parser.add_argument('-g', '--graph', action='store_true', default=False, help='print graph on ARC use and exit', dest='graph') parser.add_argument('-p', '--page', type=int, dest='page', help='print page by number (DEPRECATED, use "-s")') parser.add_argument('-r', '--raw', action='store_true', default=False, help='dump all available data with minimal formatting', dest='raw') parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP) ARGS = parser.parse_args() if sys.platform.startswith('freebsd'): # Requires py36-sysctl on FreeBSD import sysctl def is_value(ctl): return ctl.type != sysctl.CTLTYPE_NODE def namefmt(ctl, base='vfs.zfs.'): # base is removed from the name cut = len(base) return ctl.name[cut:] def load_kstats(section): base = 'kstat.zfs.misc.{section}.'.format(section=section) fmt = lambda kstat: '{name} : {value}'.format(name=namefmt(kstat, base), value=kstat.value) kstats = sysctl.filter(base) return [fmt(kstat) for kstat in kstats if is_value(kstat)] def get_params(base): ctls = sysctl.filter(base) return {namefmt(ctl): str(ctl.value) for ctl in ctls if is_value(ctl)} def get_tunable_params(): return get_params('vfs.zfs') def get_vdev_params(): return get_params('vfs.zfs.vdev') def get_version_impl(request): # FreeBSD reports versions for zpl and spa instead of zfs and spl. name = {'zfs': 'zpl', 'spl': 'spa'}[request] mib = 'vfs.zfs.version.{}'.format(name) version = sysctl.filter(mib)[0].value return '{} version {}'.format(name, version) def get_descriptions(_request): ctls = sysctl.filter('vfs.zfs') return {namefmt(ctl): ctl.description for ctl in ctls if is_value(ctl)} elif sys.platform.startswith('linux'): KSTAT_PATH = '/proc/spl/kstat/zfs' SPL_PATH = '/sys/module/spl/parameters' TUNABLES_PATH = '/sys/module/zfs/parameters' def load_kstats(section): path = os.path.join(KSTAT_PATH, section) with open(path) as f: return list(f)[2:] # Get rid of header def get_params(basepath): """Collect information on the Solaris Porting Layer (SPL) or the tunables, depending on the PATH given. Does not check if PATH is legal. """ result = {} for name in os.listdir(basepath): path = os.path.join(basepath, name) with open(path) as f: value = f.read() result[name] = value.strip() return result def get_spl_params(): return get_params(SPL_PATH) def get_tunable_params(): return get_params(TUNABLES_PATH) def get_vdev_params(): return get_params(TUNABLES_PATH) def get_version_impl(request): # The original arc_summary called /sbin/modinfo/{spl,zfs} to get # the version information. 
We switch to /sys/module/{spl,zfs}/version # to make sure we get what is really loaded in the kernel try: with open("/sys/module/{}/version".format(request)) as f: return f.read().strip() except: return "(unknown)" def get_descriptions(request): """Get the descriptions of the Solaris Porting Layer (SPL) or the tunables, return with minimal formatting. """ if request not in ('spl', 'zfs'): print('ERROR: description of "{0}" requested)'.format(request)) sys.exit(1) descs = {} target_prefix = 'parm:' # We would prefer to do this with /sys/modules -- see the discussion at # get_version() -- but there isn't a way to get the descriptions from # there, so we fall back on modinfo command = ["/sbin/modinfo", request, "-0"] info = '' try: info = subprocess.run(command, stdout=subprocess.PIPE, check=True, universal_newlines=True) raw_output = info.stdout.split('\0') except subprocess.CalledProcessError: print("Error: Descriptions not available", "(can't access kernel module)") sys.exit(1) for line in raw_output: if not line.startswith(target_prefix): continue line = line[len(target_prefix):].strip() name, raw_desc = line.split(':', 1) desc = raw_desc.rsplit('(', 1)[0] if desc == '': desc = '(No description found)' descs[name.strip()] = desc.strip() return descs def handle_unraisableException(exc_type, exc_value=None, exc_traceback=None, err_msg=None, object=None): handle_Exception(exc_type, object, exc_traceback) def handle_Exception(ex_cls, ex, tb): if ex_cls is KeyboardInterrupt: sys.exit() if ex_cls is BrokenPipeError: # It turns out that while sys.exit() triggers an exception # not handled message on Python 3.8+, os._exit() does not. os._exit(0) if ex_cls is OSError: if ex.errno == errno.ENOTCONN: sys.exit() raise ex if hasattr(sys,'unraisablehook'): # Python 3.8+ sys.unraisablehook = handle_unraisableException sys.excepthook = handle_Exception def cleanup_line(single_line): """Format a raw line of data from /proc and isolate the name value part, returning a tuple with each. Currently, this gets rid of the middle '4'. For example "arc_no_grow 4 0" returns the tuple ("arc_no_grow", "0"). """ name, _, value = single_line.split() return name, value def draw_graph(kstats_dict): """Draw a primitive graph representing the basic information on the ARC -- its size and the proportion used by MFU and MRU -- and quit. We use max size of the ARC to calculate how full it is. This is a very rough representation. 
""" arc_stats = isolate_section('arcstats', kstats_dict) GRAPH_INDENT = ' '*4 GRAPH_WIDTH = 70 arc_max = int(arc_stats['c_max']) arc_size = f_bytes(arc_stats['size']) arc_perc = f_perc(arc_stats['size'], arc_max) data_size = f_bytes(arc_stats['data_size']) meta_size = f_bytes(arc_stats['metadata_size']) dnode_size = f_bytes(arc_stats['dnode_size']) info_form = ('ARC: {0} ({1}) Data: {2} Meta: {3} Dnode: {4}') info_line = info_form.format(arc_size, arc_perc, data_size, meta_size, dnode_size) info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2) info_line = GRAPH_INDENT+info_spc+info_line graph_line = GRAPH_INDENT+'+'+('-'*(GRAPH_WIDTH-2))+'+' arc_perc = float(int(arc_stats['size'])/arc_max) data_perc = float(int(arc_stats['data_size'])/arc_max) meta_perc = float(int(arc_stats['metadata_size'])/arc_max) dnode_perc = float(int(arc_stats['dnode_size'])/arc_max) total_ticks = float(arc_perc)*GRAPH_WIDTH data_ticks = data_perc*GRAPH_WIDTH meta_ticks = meta_perc*GRAPH_WIDTH dnode_ticks = dnode_perc*GRAPH_WIDTH other_ticks = total_ticks-(data_ticks+meta_ticks+dnode_ticks) core_form = 'D'*int(data_ticks)+'M'*int(meta_ticks)+'N'*int(dnode_ticks)+\ 'O'*int(other_ticks) core_spc = ' '*(GRAPH_WIDTH-(2+len(core_form))) core_line = GRAPH_INDENT+'|'+core_form+core_spc+'|' for line in ('', info_line, graph_line, core_line, graph_line, ''): print(line) def f_bytes(byte_string): """Return human-readable representation of a byte value in powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal points. Values smaller than one KiB are returned without decimal points. Note "bytes" is a reserved keyword. """ prefixes = ([2**80, "YiB"], # yobibytes (yotta) [2**70, "ZiB"], # zebibytes (zetta) [2**60, "EiB"], # exbibytes (exa) [2**50, "PiB"], # pebibytes (peta) [2**40, "TiB"], # tebibytes (tera) [2**30, "GiB"], # gibibytes (giga) [2**20, "MiB"], # mebibytes (mega) [2**10, "KiB"]) # kibibytes (kilo) bites = int(byte_string) if bites >= 2**10: for limit, unit in prefixes: if bites >= limit: value = bites / limit break result = '{0:.1f} {1}'.format(value, unit) else: result = '{0} Bytes'.format(bites) return result def f_hits(hits_string): """Create a human-readable representation of the number of hits. The single-letter symbols used are SI to avoid the confusion caused by the different "short scale" and "long scale" representations in English, which use the same words for different values. See https://en.wikipedia.org/wiki/Names_of_large_numbers and: https://physics.nist.gov/cuu/Units/prefixes.html """ numbers = ([10**24, 'Y'], # yotta (septillion) [10**21, 'Z'], # zetta (sextillion) [10**18, 'E'], # exa (quintrillion) [10**15, 'P'], # peta (quadrillion) [10**12, 'T'], # tera (trillion) [10**9, 'G'], # giga (billion) [10**6, 'M'], # mega (million) [10**3, 'k']) # kilo (thousand) hits = int(hits_string) if hits >= 1000: for limit, symbol in numbers: if hits >= limit: value = hits/limit break result = "%0.1f%s" % (value, symbol) else: result = "%d" % hits return result def f_perc(value1, value2): """Calculate percentage and return in human-readable form. If rounding produces the result '0.0' though the first number is not zero, include a 'less-than' symbol to avoid confusion. Division by zero is handled by returning 'n/a'; no error is called. 
""" v1 = float(value1) v2 = float(value2) try: perc = 100 * v1/v2 except ZeroDivisionError: result = 'n/a' else: result = '{0:0.1f} %'.format(perc) if result == '0.0 %' and v1 > 0: result = '< 0.1 %' return result def format_raw_line(name, value): """For the --raw option for the tunable and SPL outputs, decide on the correct formatting based on the --alternate flag. """ if ARGS.alt: result = '{0}{1}={2}'.format(INDENT, name, value) else: # Right-align the value within the line length if it fits, # otherwise just separate it from the name by a single space. fit = LINE_LENGTH - len(INDENT) - len(name) overflow = len(value) + 1 w = max(fit, overflow) result = '{0}{1}{2:>{w}}'.format(INDENT, name, value, w=w) return result def get_kstats(): """Collect information on the ZFS subsystem. The step does not perform any further processing, giving us the option to only work on what is actually needed. The name "kstat" is a holdover from the Solaris utility of the same name. """ result = {} for section in SECTION_PATHS.values(): if section not in result: result[section] = load_kstats(section) return result def get_version(request): """Get the version number of ZFS or SPL on this machine for header. Returns an error string, but does not raise an error, if we can't get the ZFS/SPL version. """ if request not in ('spl', 'zfs'): error_msg = '(ERROR: "{0}" requested)'.format(request) return error_msg return get_version_impl(request) def print_header(): """Print the initial heading with date and time as well as info on the kernel and ZFS versions. This is not called for the graph. """ # datetime is now recommended over time but we keep the exact formatting # from the older version of arc_summary in case there are scripts # that expect it in this way daydate = time.strftime(DATE_FORMAT) spc_date = LINE_LENGTH-len(daydate) sys_version = os.uname() sys_msg = sys_version.sysname+' '+sys_version.release zfs = get_version('zfs') spc_zfs = LINE_LENGTH-len(zfs) machine_msg = 'Machine: '+sys_version.nodename+' ('+sys_version.machine+')' spl = get_version('spl') spc_spl = LINE_LENGTH-len(spl) print('\n'+('-'*LINE_LENGTH)) print('{0:<{spc}}{1}'.format(TITLE, daydate, spc=spc_date)) print('{0:<{spc}}{1}'.format(sys_msg, zfs, spc=spc_zfs)) print('{0:<{spc}}{1}\n'.format(machine_msg, spl, spc=spc_spl)) def print_raw(kstats_dict): """Print all available data from the system in a minimally sorted format. This can be used as a source to be piped through 'grep'. """ sections = sorted(kstats_dict.keys()) for section in sections: print('\n{0}:'.format(section.upper())) lines = sorted(kstats_dict[section]) for line in lines: name, value = cleanup_line(line) print(format_raw_line(name, value)) # Tunables and SPL must be handled separately because they come from a # different source and have descriptions the user might request print() section_spl() section_tunables() def isolate_section(section_name, kstats_dict): """From the complete information on all sections, retrieve only those for one section. 
""" try: section_data = kstats_dict[section_name] except KeyError: print('ERROR: Data on {0} not available'.format(section_data)) sys.exit(1) section_dict = dict(cleanup_line(l) for l in section_data) return section_dict # Formatted output helper functions def prt_1(text, value): """Print text and one value, no indent""" spc = ' '*(LINE_LENGTH-(len(text)+len(value))) print('{0}{spc}{1}'.format(text, value, spc=spc)) def prt_i1(text, value): """Print text and one value, with indent""" spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(value))) print(INDENT+'{0}{spc}{1}'.format(text, value, spc=spc)) def prt_2(text, value1, value2): """Print text and two values, no indent""" values = '{0:>9} {1:>9}'.format(value1, value2) spc = ' '*(LINE_LENGTH-(len(text)+len(values)+2)) print('{0}{spc} {1}'.format(text, values, spc=spc)) def prt_i2(text, value1, value2): """Print text and two values, with indent""" values = '{0:>9} {1:>9}'.format(value1, value2) spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(values)+2)) print(INDENT+'{0}{spc} {1}'.format(text, values, spc=spc)) # The section output concentrates on important parameters instead of # being exhaustive (that is what the --raw parameter is for) def section_arc(kstats_dict): """Give basic information on the ARC, MRU and MFU. This is the first and most used section. """ arc_stats = isolate_section('arcstats', kstats_dict) memory_all = arc_stats['memory_all_bytes'] memory_free = arc_stats['memory_free_bytes'] memory_avail = arc_stats['memory_available_bytes'] arc_size = arc_stats['size'] arc_target_size = arc_stats['c'] arc_max = arc_stats['c_max'] arc_min = arc_stats['c_min'] dnode_limit = arc_stats['arc_dnode_limit'] print('ARC status:') prt_i1('Total memory size:', f_bytes(memory_all)) prt_i2('Min target size:', f_perc(arc_min, memory_all), f_bytes(arc_min)) prt_i2('Max target size:', f_perc(arc_max, memory_all), f_bytes(arc_max)) prt_i2('Target size (adaptive):', f_perc(arc_size, arc_max), f_bytes(arc_target_size)) prt_i2('Current size:', f_perc(arc_size, arc_max), f_bytes(arc_size)) prt_i1('Free memory size:', f_bytes(memory_free)) prt_i1('Available memory size:', f_bytes(memory_avail)) print() compressed_size = arc_stats['compressed_size'] uncompressed_size = arc_stats['uncompressed_size'] overhead_size = arc_stats['overhead_size'] bonus_size = arc_stats['bonus_size'] dnode_size = arc_stats['dnode_size'] dbuf_size = arc_stats['dbuf_size'] hdr_size = arc_stats['hdr_size'] l2_hdr_size = arc_stats['l2_hdr_size'] abd_chunk_waste_size = arc_stats['abd_chunk_waste_size'] prt_1('ARC structural breakdown (current size):', f_bytes(arc_size)) prt_i2('Compressed size:', f_perc(compressed_size, arc_size), f_bytes(compressed_size)) prt_i2('Overhead size:', f_perc(overhead_size, arc_size), f_bytes(overhead_size)) prt_i2('Bonus size:', f_perc(bonus_size, arc_size), f_bytes(bonus_size)) prt_i2('Dnode size:', f_perc(dnode_size, arc_size), f_bytes(dnode_size)) prt_i2('Dbuf size:', f_perc(dbuf_size, arc_size), f_bytes(dbuf_size)) prt_i2('Header size:', f_perc(hdr_size, arc_size), f_bytes(hdr_size)) prt_i2('L2 header size:', f_perc(l2_hdr_size, arc_size), f_bytes(l2_hdr_size)) prt_i2('ABD chunk waste size:', f_perc(abd_chunk_waste_size, arc_size), f_bytes(abd_chunk_waste_size)) print() meta = arc_stats['meta'] pd = arc_stats['pd'] pm = arc_stats['pm'] data_size = arc_stats['data_size'] metadata_size = arc_stats['metadata_size'] anon_data = arc_stats['anon_data'] anon_metadata = arc_stats['anon_metadata'] mfu_data = arc_stats['mfu_data'] mfu_metadata = 
arc_stats['mfu_metadata'] mfu_edata = arc_stats['mfu_evictable_data'] mfu_emetadata = arc_stats['mfu_evictable_metadata'] mru_data = arc_stats['mru_data'] mru_metadata = arc_stats['mru_metadata'] mru_edata = arc_stats['mru_evictable_data'] mru_emetadata = arc_stats['mru_evictable_metadata'] mfug_data = arc_stats['mfu_ghost_data'] mfug_metadata = arc_stats['mfu_ghost_metadata'] mrug_data = arc_stats['mru_ghost_data'] mrug_metadata = arc_stats['mru_ghost_metadata'] unc_data = arc_stats['uncached_data'] unc_metadata = arc_stats['uncached_metadata'] caches_size = int(anon_data)+int(anon_metadata)+\ int(mfu_data)+int(mfu_metadata)+int(mru_data)+int(mru_metadata)+\ int(unc_data)+int(unc_metadata) prt_1('ARC types breakdown (compressed + overhead):', f_bytes(caches_size)) prt_i2('Data size:', f_perc(data_size, caches_size), f_bytes(data_size)) prt_i2('Metadata size:', f_perc(metadata_size, caches_size), f_bytes(metadata_size)) print() prt_1('ARC states breakdown (compressed + overhead):', f_bytes(caches_size)) prt_i2('Anonymous data size:', f_perc(anon_data, caches_size), f_bytes(anon_data)) prt_i2('Anonymous metadata size:', f_perc(anon_metadata, caches_size), f_bytes(anon_metadata)) s = 4294967296 v = (s-int(pd))*(s-int(meta))/s prt_i2('MFU data target:', f_perc(v, s), f_bytes(v / 65536 * caches_size / 65536)) prt_i2('MFU data size:', f_perc(mfu_data, caches_size), f_bytes(mfu_data)) prt_i2('MFU evictable data size:', f_perc(mfu_edata, caches_size), f_bytes(mfu_edata)) prt_i1('MFU ghost data size:', f_bytes(mfug_data)) v = (s-int(pm))*int(meta)/s prt_i2('MFU metadata target:', f_perc(v, s), f_bytes(v / 65536 * caches_size / 65536)) prt_i2('MFU metadata size:', f_perc(mfu_metadata, caches_size), f_bytes(mfu_metadata)) prt_i2('MFU evictable metadata size:', f_perc(mfu_emetadata, caches_size), f_bytes(mfu_emetadata)) prt_i1('MFU ghost metadata size:', f_bytes(mfug_metadata)) v = int(pd)*(s-int(meta))/s prt_i2('MRU data target:', f_perc(v, s), f_bytes(v / 65536 * caches_size / 65536)) prt_i2('MRU data size:', f_perc(mru_data, caches_size), f_bytes(mru_data)) prt_i2('MRU evictable data size:', f_perc(mru_edata, caches_size), f_bytes(mru_edata)) prt_i1('MRU ghost data size:', f_bytes(mrug_data)) v = int(pm)*int(meta)/s prt_i2('MRU metadata target:', f_perc(v, s), f_bytes(v / 65536 * caches_size / 65536)) prt_i2('MRU metadata size:', f_perc(mru_metadata, caches_size), f_bytes(mru_metadata)) prt_i2('MRU evictable metadata size:', f_perc(mru_emetadata, caches_size), f_bytes(mru_emetadata)) prt_i1('MRU ghost metadata size:', f_bytes(mrug_metadata)) prt_i2('Uncached data size:', f_perc(unc_data, caches_size), f_bytes(unc_data)) prt_i2('Uncached metadata size:', f_perc(unc_metadata, caches_size), f_bytes(unc_metadata)) print() print('ARC hash breakdown:') prt_i1('Elements:', f_hits(arc_stats['hash_elements'])) prt_i1('Collisions:', f_hits(arc_stats['hash_collisions'])) prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max'])) prt_i1('Chains:', f_hits(arc_stats['hash_chains'])) print() print('ARC misc:') prt_i2('Uncompressed size:', f_perc(uncompressed_size, compressed_size), f_bytes(uncompressed_size)) prt_i1('Memory throttles:', arc_stats['memory_throttle_count']) prt_i1('Memory direct reclaims:', arc_stats['memory_direct_count']) prt_i1('Memory indirect reclaims:', arc_stats['memory_indirect_count']) prt_i1('Deleted:', f_hits(arc_stats['deleted'])) prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss'])) prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip'])) prt_i1('Eviction skips due to L2 writes:', 
f_hits(arc_stats['evict_l2_skip'])) prt_i1('L2 cached evictions:', f_bytes(arc_stats['evict_l2_cached'])) prt_i1('L2 eligible evictions:', f_bytes(arc_stats['evict_l2_eligible'])) prt_i2('L2 eligible MFU evictions:', f_perc(arc_stats['evict_l2_eligible_mfu'], arc_stats['evict_l2_eligible']), f_bytes(arc_stats['evict_l2_eligible_mfu'])) prt_i2('L2 eligible MRU evictions:', f_perc(arc_stats['evict_l2_eligible_mru'], arc_stats['evict_l2_eligible']), f_bytes(arc_stats['evict_l2_eligible_mru'])) prt_i1('L2 ineligible evictions:', f_bytes(arc_stats['evict_l2_ineligible'])) print() def section_archits(kstats_dict): """Print information on how the caches are accessed ("arc hits"). """ arc_stats = isolate_section('arcstats', kstats_dict) all_accesses = int(arc_stats['hits'])+int(arc_stats['iohits'])+\ int(arc_stats['misses']) prt_1('ARC total accesses:', f_hits(all_accesses)) ta_todo = (('Total hits:', arc_stats['hits']), ('Total I/O hits:', arc_stats['iohits']), ('Total misses:', arc_stats['misses'])) for title, value in ta_todo: prt_i2(title, f_perc(value, all_accesses), f_hits(value)) print() dd_total = int(arc_stats['demand_data_hits']) +\ int(arc_stats['demand_data_iohits']) +\ int(arc_stats['demand_data_misses']) prt_2('ARC demand data accesses:', f_perc(dd_total, all_accesses), f_hits(dd_total)) dd_todo = (('Demand data hits:', arc_stats['demand_data_hits']), ('Demand data I/O hits:', arc_stats['demand_data_iohits']), ('Demand data misses:', arc_stats['demand_data_misses'])) for title, value in dd_todo: prt_i2(title, f_perc(value, dd_total), f_hits(value)) print() dm_total = int(arc_stats['demand_metadata_hits']) +\ int(arc_stats['demand_metadata_iohits']) +\ int(arc_stats['demand_metadata_misses']) prt_2('ARC demand metadata accesses:', f_perc(dm_total, all_accesses), f_hits(dm_total)) dm_todo = (('Demand metadata hits:', arc_stats['demand_metadata_hits']), ('Demand metadata I/O hits:', arc_stats['demand_metadata_iohits']), ('Demand metadata misses:', arc_stats['demand_metadata_misses'])) for title, value in dm_todo: prt_i2(title, f_perc(value, dm_total), f_hits(value)) print() pd_total = int(arc_stats['prefetch_data_hits']) +\ int(arc_stats['prefetch_data_iohits']) +\ int(arc_stats['prefetch_data_misses']) prt_2('ARC prefetch data accesses:', f_perc(pd_total, all_accesses), f_hits(pd_total)) pd_todo = (('Prefetch data hits:', arc_stats['prefetch_data_hits']), ('Prefetch data I/O hits:', arc_stats['prefetch_data_iohits']), ('Prefetch data misses:', arc_stats['prefetch_data_misses'])) for title, value in pd_todo: prt_i2(title, f_perc(value, pd_total), f_hits(value)) print() pm_total = int(arc_stats['prefetch_metadata_hits']) +\ int(arc_stats['prefetch_metadata_iohits']) +\ int(arc_stats['prefetch_metadata_misses']) prt_2('ARC prefetch metadata accesses:', f_perc(pm_total, all_accesses), f_hits(pm_total)) pm_todo = (('Prefetch metadata hits:', arc_stats['prefetch_metadata_hits']), ('Prefetch metadata I/O hits:', arc_stats['prefetch_metadata_iohits']), ('Prefetch metadata misses:', arc_stats['prefetch_metadata_misses'])) for title, value in pm_todo: prt_i2(title, f_perc(value, pm_total), f_hits(value)) print() all_prefetches = int(arc_stats['predictive_prefetch'])+\ int(arc_stats['prescient_prefetch']) prt_2('ARC predictive prefetches:', f_perc(arc_stats['predictive_prefetch'], all_prefetches), f_hits(arc_stats['predictive_prefetch'])) prt_i2('Demand hits after predictive:', f_perc(arc_stats['demand_hit_predictive_prefetch'], arc_stats['predictive_prefetch']), 
f_hits(arc_stats['demand_hit_predictive_prefetch'])) prt_i2('Demand I/O hits after predictive:', f_perc(arc_stats['demand_iohit_predictive_prefetch'], arc_stats['predictive_prefetch']), f_hits(arc_stats['demand_iohit_predictive_prefetch'])) never = int(arc_stats['predictive_prefetch']) -\ int(arc_stats['demand_hit_predictive_prefetch']) -\ int(arc_stats['demand_iohit_predictive_prefetch']) prt_i2('Never demanded after predictive:', f_perc(never, arc_stats['predictive_prefetch']), f_hits(never)) print() prt_2('ARC prescient prefetches:', f_perc(arc_stats['prescient_prefetch'], all_prefetches), f_hits(arc_stats['prescient_prefetch'])) prt_i2('Demand hits after prescient:', f_perc(arc_stats['demand_hit_prescient_prefetch'], arc_stats['prescient_prefetch']), f_hits(arc_stats['demand_hit_prescient_prefetch'])) prt_i2('Demand I/O hits after prescient:', f_perc(arc_stats['demand_iohit_prescient_prefetch'], arc_stats['prescient_prefetch']), f_hits(arc_stats['demand_iohit_prescient_prefetch'])) never = int(arc_stats['prescient_prefetch'])-\ int(arc_stats['demand_hit_prescient_prefetch'])-\ int(arc_stats['demand_iohit_prescient_prefetch']) prt_i2('Never demanded after prescient:', f_perc(never, arc_stats['prescient_prefetch']), f_hits(never)) print() print('ARC states hits of all accesses:') cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']), ('Most recently used (MRU):', arc_stats['mru_hits']), ('Most frequently used (MFU) ghost:', arc_stats['mfu_ghost_hits']), ('Most recently used (MRU) ghost:', arc_stats['mru_ghost_hits']), ('Uncached:', arc_stats['uncached_hits'])) for title, value in cl_todo: prt_i2(title, f_perc(value, all_accesses), f_hits(value)) print() def section_dmu(kstats_dict): """Collect information on the DMU""" zfetch_stats = isolate_section('zfetchstats', kstats_dict) zfetch_access_total = int(zfetch_stats['hits']) +\ int(zfetch_stats['future']) + int(zfetch_stats['stride']) +\ int(zfetch_stats['past']) + int(zfetch_stats['misses']) prt_1('DMU predictive prefetcher calls:', f_hits(zfetch_access_total)) prt_i2('Stream hits:', f_perc(zfetch_stats['hits'], zfetch_access_total), f_hits(zfetch_stats['hits'])) future = int(zfetch_stats['future']) + int(zfetch_stats['stride']) prt_i2('Hits ahead of stream:', f_perc(future, zfetch_access_total), f_hits(future)) prt_i2('Hits behind stream:', f_perc(zfetch_stats['past'], zfetch_access_total), f_hits(zfetch_stats['past'])) prt_i2('Stream misses:', f_perc(zfetch_stats['misses'], zfetch_access_total), f_hits(zfetch_stats['misses'])) prt_i2('Streams limit reached:', f_perc(zfetch_stats['max_streams'], zfetch_stats['misses']), f_hits(zfetch_stats['max_streams'])) prt_i1('Stream strides:', f_hits(zfetch_stats['stride'])) prt_i1('Prefetches issued', f_hits(zfetch_stats['io_issued'])) print() def section_l2arc(kstats_dict): """Collect information on L2ARC device if present. If not, tell user that we're skipping the section. 
""" # The L2ARC statistics live in the same section as the normal ARC stuff arc_stats = isolate_section('arcstats', kstats_dict) if arc_stats['l2_size'] == '0': print('L2ARC not detected, skipping section\n') return l2_errors = int(arc_stats['l2_writes_error']) +\ int(arc_stats['l2_cksum_bad']) +\ int(arc_stats['l2_io_error']) l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses']) health = 'HEALTHY' if l2_errors > 0: health = 'DEGRADED' prt_1('L2ARC status:', health) l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'), ('Free on write:', 'l2_free_on_write'), ('R/W clashes:', 'l2_rw_clash'), ('Bad checksums:', 'l2_cksum_bad'), ('Read errors:', 'l2_io_error'), ('Write errors:', 'l2_writes_error')) for title, value in l2_todo: prt_i1(title, f_hits(arc_stats[value])) print() prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size'])) prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']), f_bytes(arc_stats['l2_asize'])) prt_i2('Header size:', f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), f_bytes(arc_stats['l2_hdr_size'])) prt_i2('MFU allocated size:', f_perc(arc_stats['l2_mfu_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_mfu_asize'])) prt_i2('MRU allocated size:', f_perc(arc_stats['l2_mru_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_mru_asize'])) prt_i2('Prefetch allocated size:', f_perc(arc_stats['l2_prefetch_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_prefetch_asize'])) prt_i2('Data (buffer content) allocated size:', f_perc(arc_stats['l2_bufc_data_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_bufc_data_asize'])) prt_i2('Metadata (buffer content) allocated size:', f_perc(arc_stats['l2_bufc_metadata_asize'], arc_stats['l2_asize']), f_bytes(arc_stats['l2_bufc_metadata_asize'])) print() prt_1('L2ARC breakdown:', f_hits(l2_access_total)) prt_i2('Hit ratio:', f_perc(arc_stats['l2_hits'], l2_access_total), f_hits(arc_stats['l2_hits'])) prt_i2('Miss ratio:', f_perc(arc_stats['l2_misses'], l2_access_total), f_hits(arc_stats['l2_misses'])) print() print('L2ARC I/O:') prt_i2('Reads:', f_bytes(arc_stats['l2_read_bytes']), f_hits(arc_stats['l2_hits'])) prt_i2('Writes:', f_bytes(arc_stats['l2_write_bytes']), f_hits(arc_stats['l2_writes_sent'])) print() print('L2ARC evicts:') prt_i1('L1 cached:', f_hits(arc_stats['l2_evict_l1cached'])) prt_i1('While reading:', f_hits(arc_stats['l2_evict_reading'])) print() def section_spl(*_): """Print the SPL parameters, if requested with alternative format and/or descriptions. This does not use kstats. """ if sys.platform.startswith('freebsd'): # No SPL support in FreeBSD return spls = get_spl_params() keylist = sorted(spls.keys()) print('Solaris Porting Layer (SPL):') if ARGS.desc: descriptions = get_descriptions('spl') for key in keylist: value = spls[key] if ARGS.desc: try: print(INDENT+'#', descriptions[key]) except KeyError: print(INDENT+'# (No description found)') # paranoid print(format_raw_line(key, value)) print() def section_tunables(*_): """Print the tunables, if requested with alternative format and/or descriptions. This does not use kstasts. """ tunables = get_tunable_params() keylist = sorted(tunables.keys()) print('Tunables:') if ARGS.desc: descriptions = get_descriptions('zfs') for key in keylist: value = tunables[key] if ARGS.desc: try: print(INDENT+'#', descriptions[key]) except KeyError: print(INDENT+'# (No description found)') # paranoid print(format_raw_line(key, value)) print() def section_zil(kstats_dict): """Collect information on the ZFS Intent Log. 
Some of the information taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h """ zil_stats = isolate_section('zil', kstats_dict) prt_1('ZIL committed transactions:', f_hits(zil_stats['zil_itx_count'])) prt_i1('Commit requests:', f_hits(zil_stats['zil_commit_count'])) prt_i1('Flushes to stable storage:', f_hits(zil_stats['zil_commit_writer_count'])) prt_i2('Transactions to SLOG storage pool:', f_bytes(zil_stats['zil_itx_metaslab_slog_bytes']), f_hits(zil_stats['zil_itx_metaslab_slog_count'])) prt_i2('Transactions to non-SLOG storage pool:', f_bytes(zil_stats['zil_itx_metaslab_normal_bytes']), f_hits(zil_stats['zil_itx_metaslab_normal_count'])) print() section_calls = {'arc': section_arc, 'archits': section_archits, 'dmu': section_dmu, 'l2arc': section_l2arc, 'spl': section_spl, 'tunables': section_tunables, 'zil': section_zil} def main(): """Run program. The options to draw a graph and to print all data raw are treated separately because they come with their own call. """ + # notify user for upcoming renaming in 2.4.0 + abs_path = os.path.abspath(sys.argv[0].strip()) + script_name = os.path.basename(abs_path) + if script_name != "zarcsummary": + sys.stderr.write("Note: this script will be renamed to zarcsummary in ") + sys.stderr.write("zfs 2.4.0. Please migrate ASAP.\n") + kstats = get_kstats() if ARGS.graph: draw_graph(kstats) sys.exit(0) print_header() if ARGS.raw: print_raw(kstats) elif ARGS.section: try: section_calls[ARGS.section](kstats) except KeyError: print('Error: Section "{0}" unknown'.format(ARGS.section)) sys.exit(1) elif ARGS.page: print('WARNING: Pages are deprecated, please use "--section"\n') pages_to_calls = {1: 'arc', 2: 'archits', 3: 'l2arc', 4: 'dmu', 5: 'vdev', 6: 'tunables'} try: call = pages_to_calls[ARGS.page] except KeyError: print('Error: Page "{0}" not supported'.format(ARGS.page)) sys.exit(1) else: section_calls[call](kstats) else: # If no parameters were given, we print all sections. We might want to # change the sequence by hand calls = sorted(section_calls.keys()) for section in calls: section_calls[section](kstats) sys.exit(0) if __name__ == '__main__': main() diff --git a/sys/contrib/openzfs/cmd/arcstat.in b/sys/contrib/openzfs/cmd/arcstat.in index 6f9abb39c3fb..e153eddb36cf 100755 --- a/sys/contrib/openzfs/cmd/arcstat.in +++ b/sys/contrib/openzfs/cmd/arcstat.in @@ -1,803 +1,812 @@ #!/usr/bin/env @PYTHON_SHEBANG@ # SPDX-License-Identifier: CDDL-1.0 # # Print out ZFS ARC Statistics exported via kstat(1) # For a definition of fields, or usage, use arcstat -v # # This script was originally a fork of the original arcstat.pl (0.1) # by Neelakanth Nadgir, originally published on his Sun blog on # 09/18/2007 # http://blogs.sun.com/realneel/entry/zfs_arc_statistics # # A new version aimed to improve upon the original by adding features # and fixing bugs as needed. This version was maintained by Mike # Harsch and was hosted in a public open source repository: # http://github.com/mharsch/arcstat # # but has since moved to the illumos-gate repository. # # This Python port was written by John Hixson for FreeNAS, introduced # in commit e2c29f: # https://github.com/freenas/freenas # # and has been improved by many people since. # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License, Version 1.0 only # (the "License"). You may not use this file except in compliance # with the License. 
# # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Fields have a fixed width. Every interval, we fill the "v" # hash with its corresponding value (v[field]=value) using calculate(). # @hdr is the array of fields that needs to be printed, so we # just iterate over this array and print the values using our pretty printer. # # This script must remain compatible with Python 3.6+. # import sys import time import getopt import re import copy +import os from signal import signal, SIGINT, SIGWINCH, SIG_DFL cols = { # HDR: [Size, Scale, Description] "time": [8, -1, "Time"], "hits": [4, 1000, "ARC hits per second"], "iohs": [4, 1000, "ARC I/O hits per second"], "miss": [4, 1000, "ARC misses per second"], "read": [4, 1000, "Total ARC accesses per second"], "hit%": [4, 100, "ARC hit percentage"], "ioh%": [4, 100, "ARC I/O hit percentage"], "miss%": [5, 100, "ARC miss percentage"], "dhit": [4, 1000, "Demand hits per second"], "dioh": [4, 1000, "Demand I/O hits per second"], "dmis": [4, 1000, "Demand misses per second"], "dh%": [3, 100, "Demand hit percentage"], "di%": [3, 100, "Demand I/O hit percentage"], "dm%": [3, 100, "Demand miss percentage"], "ddhit": [5, 1000, "Demand data hits per second"], "ddioh": [5, 1000, "Demand data I/O hits per second"], "ddmis": [5, 1000, "Demand data misses per second"], "ddh%": [4, 100, "Demand data hit percentage"], "ddi%": [4, 100, "Demand data I/O hit percentage"], "ddm%": [4, 100, "Demand data miss percentage"], "dmhit": [5, 1000, "Demand metadata hits per second"], "dmioh": [5, 1000, "Demand metadata I/O hits per second"], "dmmis": [5, 1000, "Demand metadata misses per second"], "dmh%": [4, 100, "Demand metadata hit percentage"], "dmi%": [4, 100, "Demand metadata I/O hit percentage"], "dmm%": [4, 100, "Demand metadata miss percentage"], "phit": [4, 1000, "Prefetch hits per second"], "pioh": [4, 1000, "Prefetch I/O hits per second"], "pmis": [4, 1000, "Prefetch misses per second"], "ph%": [3, 100, "Prefetch hits percentage"], "pi%": [3, 100, "Prefetch I/O hits percentage"], "pm%": [3, 100, "Prefetch miss percentage"], "pdhit": [5, 1000, "Prefetch data hits per second"], "pdioh": [5, 1000, "Prefetch data I/O hits per second"], "pdmis": [5, 1000, "Prefetch data misses per second"], "pdh%": [4, 100, "Prefetch data hits percentage"], "pdi%": [4, 100, "Prefetch data I/O hits percentage"], "pdm%": [4, 100, "Prefetch data miss percentage"], "pmhit": [5, 1000, "Prefetch metadata hits per second"], "pmioh": [5, 1000, "Prefetch metadata I/O hits per second"], "pmmis": [5, 1000, "Prefetch metadata misses per second"], "pmh%": [4, 100, "Prefetch metadata hits percentage"], "pmi%": [4, 100, "Prefetch metadata I/O hits percentage"], "pmm%": [4, 100, "Prefetch metadata miss percentage"], "mhit": [4, 1000, "Metadata hits per second"], "mioh": [4, 1000, "Metadata I/O hits per second"], "mmis": [4, 1000, "Metadata misses per second"], "mread": [5, 1000, "Metadata accesses per second"], "mh%": [3, 100, "Metadata hit percentage"], "mi%": [3, 100, 
"Metadata I/O hit percentage"], "mm%": [3, 100, "Metadata miss percentage"], "arcsz": [5, 1024, "ARC size"], "size": [5, 1024, "ARC size"], "c": [5, 1024, "ARC target size"], "mfu": [4, 1000, "MFU list hits per second"], "mru": [4, 1000, "MRU list hits per second"], "mfug": [4, 1000, "MFU ghost list hits per second"], "mrug": [4, 1000, "MRU ghost list hits per second"], "unc": [4, 1000, "Uncached list hits per second"], "eskip": [5, 1000, "evict_skip per second"], "el2skip": [7, 1000, "evict skip, due to l2 writes, per second"], "el2cach": [7, 1024, "Size of L2 cached evictions per second"], "el2el": [5, 1024, "Size of L2 eligible evictions per second"], "el2mfu": [6, 1024, "Size of L2 eligible MFU evictions per second"], "el2mru": [6, 1024, "Size of L2 eligible MRU evictions per second"], "el2inel": [7, 1024, "Size of L2 ineligible evictions per second"], "mtxmis": [6, 1000, "mutex_miss per second"], "dread": [5, 1000, "Demand accesses per second"], "ddread": [6, 1000, "Demand data accesses per second"], "dmread": [6, 1000, "Demand metadata accesses per second"], "pread": [5, 1000, "Prefetch accesses per second"], "pdread": [6, 1000, "Prefetch data accesses per second"], "pmread": [6, 1000, "Prefetch metadata accesses per second"], "l2hits": [6, 1000, "L2ARC hits per second"], "l2miss": [6, 1000, "L2ARC misses per second"], "l2read": [6, 1000, "Total L2ARC accesses per second"], "l2hit%": [6, 100, "L2ARC access hit percentage"], "l2miss%": [7, 100, "L2ARC access miss percentage"], "l2pref": [6, 1024, "L2ARC prefetch allocated size"], "l2mfu": [5, 1024, "L2ARC MFU allocated size"], "l2mru": [5, 1024, "L2ARC MRU allocated size"], "l2data": [6, 1024, "L2ARC data allocated size"], "l2meta": [6, 1024, "L2ARC metadata allocated size"], "l2pref%": [7, 100, "L2ARC prefetch percentage"], "l2mfu%": [6, 100, "L2ARC MFU percentage"], "l2mru%": [6, 100, "L2ARC MRU percentage"], "l2data%": [7, 100, "L2ARC data percentage"], "l2meta%": [7, 100, "L2ARC metadata percentage"], "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], "l2size": [6, 1024, "Size of the L2ARC"], "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"], "l2wbytes": [8, 1024, "Bytes written per second to the L2ARC"], "grow": [4, 1000, "ARC grow disabled"], "need": [5, 1024, "ARC reclaim need"], "free": [5, 1024, "ARC free memory"], "avail": [5, 1024, "ARC available memory"], "waste": [5, 1024, "Wasted memory due to round up to pagesize"], "ztotal": [6, 1000, "zfetch total prefetcher calls per second"], "zhits": [5, 1000, "zfetch stream hits per second"], "zahead": [6, 1000, "zfetch hits ahead of streams per second"], "zpast": [5, 1000, "zfetch hits behind streams per second"], "zmisses": [7, 1000, "zfetch stream misses per second"], "zmax": [4, 1000, "zfetch limit reached per second"], "zfuture": [7, 1000, "zfetch stream future per second"], "zstride": [7, 1000, "zfetch stream strides per second"], "zissued": [7, 1000, "zfetch prefetches issued per second"], "zactive": [7, 1000, "zfetch prefetches active per second"], } # ARC structural breakdown from arc_summary structfields = { "cmp": ["compressed", "Compressed"], "ovh": ["overhead", "Overhead"], "bon": ["bonus", "Bonus"], "dno": ["dnode", "Dnode"], "dbu": ["dbuf", "Dbuf"], "hdr": ["hdr", "Header"], "l2h": ["l2_hdr", "L2 header"], "abd": ["abd_chunk_waste", "ABD chunk waste"], } structstats = { # size stats "percent": "size", # percentage of this value "sz": ["_size", "size"], } # ARC types breakdown from arc_summary typefields = { "data": ["data", "ARC data"], 
"meta": ["metadata", "ARC metadata"], } typestats = { # size stats "percent": "cachessz", # percentage of this value "tg": ["_target", "target"], "sz": ["_size", "size"], } # ARC states breakdown from arc_summary statefields = { "ano": ["anon", "Anonymous"], "mfu": ["mfu", "MFU"], "mru": ["mru", "MRU"], "unc": ["uncached", "Uncached"], } targetstats = { "percent": "cachessz", # percentage of this value "fields": ["mfu", "mru"], # only applicable to these fields "tg": ["_target", "target"], "dt": ["_data_target", "data target"], "mt": ["_metadata_target", "metadata target"], } statestats = { # size stats "percent": "cachessz", # percentage of this value "sz": ["_size", "size"], "da": ["_data", "data size"], "me": ["_metadata", "metadata size"], "ed": ["_evictable_data", "evictable data size"], "em": ["_evictable_metadata", "evictable metadata size"], } ghoststats = { "fields": ["mfu", "mru"], # only applicable to these fields "gsz": ["_ghost_size", "ghost size"], "gd": ["_ghost_data", "ghost data size"], "gm": ["_ghost_metadata", "ghost metadata size"], } # fields and stats fieldstats = [ [structfields, structstats], [typefields, typestats], [statefields, targetstats, statestats, ghoststats], ] for fs in fieldstats: fields, stats = fs[0], fs[1:] for field, fieldval in fields.items(): for group in stats: for stat, statval in group.items(): if stat in ["fields", "percent"] or \ ("fields" in group and field not in group["fields"]): continue colname = field + stat coldesc = fieldval[1] + " " + statval[1] cols[colname] = [len(colname), 1024, coldesc] if "percent" in group: cols[colname + "%"] = [len(colname) + 1, 100, \ coldesc + " percentage"] v = {} hdr = ["time", "read", "ddread", "ddh%", "dmread", "dmh%", "pread", "ph%", "size", "c", "avail"] xhdr = ["time", "mfu", "mru", "mfug", "mrug", "unc", "eskip", "mtxmis", "dread", "pread", "read"] zhdr = ["time", "ztotal", "zhits", "zahead", "zpast", "zmisses", "zmax", "zfuture", "zstride", "zissued", "zactive"] sint = 1 # Default interval is 1 second count = 1 # Default count is 1 hdr_intr = 20 # Print header every 20 lines of output opfile = None sep = " " # Default separator is 2 spaces l2exist = False cmd = ("Usage: arcstat [-havxp] [-f fields] [-o file] [-s string] [interval " "[count]]\n") cur = {} d = {} out = None kstat = None pretty_print = True if sys.platform.startswith('freebsd'): # Requires py-sysctl on FreeBSD import sysctl def kstat_update(): global kstat k = [ctl for ctl in sysctl.filter('kstat.zfs.misc.arcstats') if ctl.type != sysctl.CTLTYPE_NODE] k += [ctl for ctl in sysctl.filter('kstat.zfs.misc.zfetchstats') if ctl.type != sysctl.CTLTYPE_NODE] if not k: sys.exit(1) kstat = {} for s in k: if not s: continue name, value = s.name, s.value if "arcstats" in name: # Trims 'kstat.zfs.misc.arcstats' from the name kstat[name[24:]] = int(value) else: kstat["zfetch_" + name[27:]] = int(value) elif sys.platform.startswith('linux'): def kstat_update(): global kstat k1 = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] k2 = ["zfetch_" + line.strip() for line in open('/proc/spl/kstat/zfs/zfetchstats')] if k1 is None or k2 is None: sys.exit(1) del k1[0:2] del k2[0:2] k = k1 + k2 kstat = {} for s in k: if not s: continue name, unused, value = s.split() kstat[name] = int(value) def detailed_usage(): sys.stderr.write("%s\n" % cmd) sys.stderr.write("Field definitions are as follows:\n") for key in cols: sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) sys.stderr.write("\n") sys.exit(0) def usage(): sys.stderr.write("%s\n" % cmd) 
sys.stderr.write("\t -h : Print this help message\n") sys.stderr.write("\t -a : Print all possible stats\n") sys.stderr.write("\t -v : List all possible field headers and definitions" "\n") sys.stderr.write("\t -x : Print extended stats\n") sys.stderr.write("\t -z : Print zfetch stats\n") sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") sys.stderr.write("\t -o : Redirect output to the specified file\n") sys.stderr.write("\t -s : Override default field separator with custom " "character or string\n") sys.stderr.write("\t -p : Disable auto-scaling of numerical fields\n") sys.stderr.write("\nExamples:\n") sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") sys.stderr.write("\tarcstat -v\n") sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") sys.stderr.write("\n") sys.exit(1) def snap_stats(): global cur global kstat prev = copy.deepcopy(cur) kstat_update() cur = kstat # fill in additional values from arc_summary cur["caches_size"] = caches_size = cur["anon_data"]+cur["anon_metadata"]+\ cur["mfu_data"]+cur["mfu_metadata"]+cur["mru_data"]+cur["mru_metadata"]+\ cur["uncached_data"]+cur["uncached_metadata"] s = 4294967296 pd = cur["pd"] pm = cur["pm"] meta = cur["meta"] v = (s-int(pd))*(s-int(meta))/s cur["mfu_data_target"] = v / 65536 * caches_size / 65536 v = (s-int(pm))*int(meta)/s cur["mfu_metadata_target"] = v / 65536 * caches_size / 65536 v = int(pd)*(s-int(meta))/s cur["mru_data_target"] = v / 65536 * caches_size / 65536 v = int(pm)*int(meta)/s cur["mru_metadata_target"] = v / 65536 * caches_size / 65536 cur["data_target"] = cur["mfu_data_target"] + cur["mru_data_target"] cur["metadata_target"] = cur["mfu_metadata_target"] + cur["mru_metadata_target"] cur["mfu_target"] = cur["mfu_data_target"] + cur["mfu_metadata_target"] cur["mru_target"] = cur["mru_data_target"] + cur["mru_metadata_target"] for key in cur: if re.match(key, "class"): continue if key in prev: d[key] = cur[key] - prev[key] else: d[key] = cur[key] def isint(num): if isinstance(num, float): return num.is_integer() if isinstance(num, int): return True return False def prettynum(sz, scale, num=0): suffix = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] index = 0 # Special case for date field if scale == -1: return "%s" % num if scale != 100: while abs(num) > scale and index < 5: num = num / scale index += 1 width = sz - (0 if index == 0 else 1) intlen = len("%.0f" % num) # %.0f rounds to nearest int if sint == 1 and isint(num) or width < intlen + 2: decimal = 0 else: decimal = 1 return "%*.*f%s" % (width, decimal, num, suffix[index]) def print_values(): global hdr global sep global v global pretty_print if pretty_print: fmt = lambda col: prettynum(cols[col][0], cols[col][1], v[col]) else: fmt = lambda col: str(v[col]) sys.stdout.write(sep.join(fmt(col) for col in hdr)) sys.stdout.write("\n") sys.stdout.flush() def print_header(): global hdr global sep global pretty_print if pretty_print: fmt = lambda col: "%*s" % (cols[col][0], col) else: fmt = lambda col: col sys.stdout.write(sep.join(fmt(col) for col in hdr)) sys.stdout.write("\n") def get_terminal_lines(): try: import fcntl import termios import struct data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234') sz = struct.unpack('hh', data) return sz[0] except Exception: pass def update_hdr_intr(): global hdr_intr lines = get_terminal_lines() if lines and lines > 3: hdr_intr = lines - 3 def resize_handler(signum, frame): update_hdr_intr() def init(): global sint global count global 
hdr global xhdr global zhdr global opfile global sep global out global l2exist global pretty_print desired_cols = None aflag = False xflag = False hflag = False vflag = False zflag = False i = 1 try: opts, args = getopt.getopt( sys.argv[1:], "axzo:hvs:f:p", [ "all", "extended", "zfetch", "outfile", "help", "verbose", "separator", "columns", "parsable" ] ) except getopt.error as msg: sys.stderr.write("Error: %s\n" % str(msg)) usage() opts = None for opt, arg in opts: if opt in ('-a', '--all'): aflag = True if opt in ('-x', '--extended'): xflag = True if opt in ('-o', '--outfile'): opfile = arg i += 1 if opt in ('-h', '--help'): hflag = True if opt in ('-v', '--verbose'): vflag = True if opt in ('-s', '--separator'): sep = arg i += 1 if opt in ('-f', '--columns'): desired_cols = arg i += 1 if opt in ('-p', '--parsable'): pretty_print = False if opt in ('-z', '--zfetch'): zflag = True i += 1 argv = sys.argv[i:] sint = int(argv[0]) if argv else sint count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) if hflag or (xflag and zflag) or ((zflag or xflag) and desired_cols): usage() if vflag: detailed_usage() if xflag: hdr = xhdr if zflag: hdr = zhdr update_hdr_intr() # check if L2ARC exists snap_stats() l2_size = cur.get("l2_size") if l2_size: l2exist = True if desired_cols: hdr = desired_cols.split(",") invalid = [] incompat = [] for ele in hdr: if ele not in cols: invalid.append(ele) elif not l2exist and ele.startswith("l2"): sys.stdout.write("No L2ARC Here\n%s\n" % ele) incompat.append(ele) if len(invalid) > 0: sys.stderr.write("Invalid column definition! -- %s\n" % invalid) usage() if len(incompat) > 0: sys.stderr.write("Incompatible field specified! -- %s\n" % incompat) usage() if aflag: if l2exist: hdr = cols.keys() else: hdr = [col for col in cols.keys() if not col.startswith("l2")] if opfile: try: out = open(opfile, "w") sys.stdout = out except IOError: sys.stderr.write("Cannot open %s for writing\n" % opfile) sys.exit(1) def calculate(): global d global v global l2exist v = dict() v["time"] = time.strftime("%H:%M:%S", time.localtime()) v["hits"] = d["hits"] / sint v["iohs"] = d["iohits"] / sint v["miss"] = d["misses"] / sint v["read"] = v["hits"] + v["iohs"] + v["miss"] v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0 v["ioh%"] = 100 * v["iohs"] / v["read"] if v["read"] > 0 else 0 v["miss%"] = 100 - v["hit%"] - v["ioh%"] if v["read"] > 0 else 0 v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint v["dioh"] = (d["demand_data_iohits"] + d["demand_metadata_iohits"]) / sint v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint v["dread"] = v["dhit"] + v["dioh"] + v["dmis"] v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0 v["di%"] = 100 * v["dioh"] / v["dread"] if v["dread"] > 0 else 0 v["dm%"] = 100 - v["dh%"] - v["di%"] if v["dread"] > 0 else 0 v["ddhit"] = d["demand_data_hits"] / sint v["ddioh"] = d["demand_data_iohits"] / sint v["ddmis"] = d["demand_data_misses"] / sint v["ddread"] = v["ddhit"] + v["ddioh"] + v["ddmis"] v["ddh%"] = 100 * v["ddhit"] / v["ddread"] if v["ddread"] > 0 else 0 v["ddi%"] = 100 * v["ddioh"] / v["ddread"] if v["ddread"] > 0 else 0 v["ddm%"] = 100 - v["ddh%"] - v["ddi%"] if v["ddread"] > 0 else 0 v["dmhit"] = d["demand_metadata_hits"] / sint v["dmioh"] = d["demand_metadata_iohits"] / sint v["dmmis"] = d["demand_metadata_misses"] / sint v["dmread"] = v["dmhit"] + v["dmioh"] + v["dmmis"] v["dmh%"] = 100 * v["dmhit"] / v["dmread"] if v["dmread"] > 0 else 0 v["dmi%"] = 100 * 
v["dmioh"] / v["dmread"] if v["dmread"] > 0 else 0 v["dmm%"] = 100 - v["dmh%"] - v["dmi%"] if v["dmread"] > 0 else 0 v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint v["pioh"] = (d["prefetch_data_iohits"] + d["prefetch_metadata_iohits"]) / sint v["pmis"] = (d["prefetch_data_misses"] + d["prefetch_metadata_misses"]) / sint v["pread"] = v["phit"] + v["pioh"] + v["pmis"] v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0 v["pi%"] = 100 * v["pioh"] / v["pread"] if v["pread"] > 0 else 0 v["pm%"] = 100 - v["ph%"] - v["pi%"] if v["pread"] > 0 else 0 v["pdhit"] = d["prefetch_data_hits"] / sint v["pdioh"] = d["prefetch_data_iohits"] / sint v["pdmis"] = d["prefetch_data_misses"] / sint v["pdread"] = v["pdhit"] + v["pdioh"] + v["pdmis"] v["pdh%"] = 100 * v["pdhit"] / v["pdread"] if v["pdread"] > 0 else 0 v["pdi%"] = 100 * v["pdioh"] / v["pdread"] if v["pdread"] > 0 else 0 v["pdm%"] = 100 - v["pdh%"] - v["pdi%"] if v["pdread"] > 0 else 0 v["pmhit"] = d["prefetch_metadata_hits"] / sint v["pmioh"] = d["prefetch_metadata_iohits"] / sint v["pmmis"] = d["prefetch_metadata_misses"] / sint v["pmread"] = v["pmhit"] + v["pmioh"] + v["pmmis"] v["pmh%"] = 100 * v["pmhit"] / v["pmread"] if v["pmread"] > 0 else 0 v["pmi%"] = 100 * v["pmioh"] / v["pmread"] if v["pmread"] > 0 else 0 v["pmm%"] = 100 - v["pmh%"] - v["pmi%"] if v["pmread"] > 0 else 0 v["mhit"] = (d["prefetch_metadata_hits"] + d["demand_metadata_hits"]) / sint v["mioh"] = (d["prefetch_metadata_iohits"] + d["demand_metadata_iohits"]) / sint v["mmis"] = (d["prefetch_metadata_misses"] + d["demand_metadata_misses"]) / sint v["mread"] = v["mhit"] + v["mioh"] + v["mmis"] v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0 v["mi%"] = 100 * v["mioh"] / v["mread"] if v["mread"] > 0 else 0 v["mm%"] = 100 - v["mh%"] - v["mi%"] if v["mread"] > 0 else 0 v["arcsz"] = cur["size"] v["size"] = cur["size"] v["c"] = cur["c"] v["mfu"] = d["mfu_hits"] / sint v["mru"] = d["mru_hits"] / sint v["mrug"] = d["mru_ghost_hits"] / sint v["mfug"] = d["mfu_ghost_hits"] / sint v["unc"] = d["uncached_hits"] / sint v["eskip"] = d["evict_skip"] / sint v["el2skip"] = d["evict_l2_skip"] / sint v["el2cach"] = d["evict_l2_cached"] / sint v["el2el"] = d["evict_l2_eligible"] / sint v["el2mfu"] = d["evict_l2_eligible_mfu"] / sint v["el2mru"] = d["evict_l2_eligible_mru"] / sint v["el2inel"] = d["evict_l2_ineligible"] / sint v["mtxmis"] = d["mutex_miss"] / sint v["ztotal"] = (d["zfetch_hits"] + d["zfetch_future"] + d["zfetch_stride"] + d["zfetch_past"] + d["zfetch_misses"]) / sint v["zhits"] = d["zfetch_hits"] / sint v["zahead"] = (d["zfetch_future"] + d["zfetch_stride"]) / sint v["zpast"] = d["zfetch_past"] / sint v["zmisses"] = d["zfetch_misses"] / sint v["zmax"] = d["zfetch_max_streams"] / sint v["zfuture"] = d["zfetch_future"] / sint v["zstride"] = d["zfetch_stride"] / sint v["zissued"] = d["zfetch_io_issued"] / sint v["zactive"] = d["zfetch_io_active"] / sint # ARC structural breakdown, ARC types breakdown, ARC states breakdown v["cachessz"] = cur["caches_size"] for fs in fieldstats: fields, stats = fs[0], fs[1:] for field, fieldval in fields.items(): for group in stats: for stat, statval in group.items(): if stat in ["fields", "percent"] or \ ("fields" in group and field not in group["fields"]): continue colname = field + stat v[colname] = cur[fieldval[0] + statval[0]] if "percent" in group: v[colname + "%"] = 100 * v[colname] / \ v[group["percent"]] if v[group["percent"]] > 0 else 0 if l2exist: l2asize = cur["l2_asize"] v["l2hits"] = 
d["l2_hits"] / sint v["l2miss"] = d["l2_misses"] / sint v["l2read"] = v["l2hits"] + v["l2miss"] v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0 v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 v["l2asize"] = l2asize v["l2size"] = cur["l2_size"] v["l2bytes"] = d["l2_read_bytes"] / sint v["l2wbytes"] = d["l2_write_bytes"] / sint v["l2pref"] = cur["l2_prefetch_asize"] v["l2mfu"] = cur["l2_mfu_asize"] v["l2mru"] = cur["l2_mru_asize"] v["l2data"] = cur["l2_bufc_data_asize"] v["l2meta"] = cur["l2_bufc_metadata_asize"] v["l2pref%"] = 100 * v["l2pref"] / l2asize if l2asize > 0 else 0 v["l2mfu%"] = 100 * v["l2mfu"] / l2asize if l2asize > 0 else 0 v["l2mru%"] = 100 * v["l2mru"] / l2asize if l2asize > 0 else 0 v["l2data%"] = 100 * v["l2data"] / l2asize if l2asize > 0 else 0 v["l2meta%"] = 100 * v["l2meta"] / l2asize if l2asize > 0 else 0 v["grow"] = 0 if cur["arc_no_grow"] else 1 v["need"] = cur["arc_need_free"] v["free"] = cur["memory_free_bytes"] v["avail"] = cur["memory_available_bytes"] v["waste"] = cur["abd_chunk_waste_size"] def main(): + + # notify user for upcoming renaming in 2.4.0 + abs_path = os.path.abspath(sys.argv[0].strip()) + script_name = os.path.basename(abs_path) + if script_name != "zarcstat": + sys.stderr.write("Note: this script will be renamed to zarcstat in ") + sys.stderr.write("zfs 2.4.0. Please migrate ASAP.\n") + global sint global count global hdr_intr i = 0 count_flag = 0 init() if count > 0: count_flag = 1 signal(SIGINT, SIG_DFL) signal(SIGWINCH, resize_handler) while True: if i == 0: print_header() snap_stats() calculate() print_values() if count_flag == 1: if count <= 1: break count -= 1 i = 0 if i >= hdr_intr else i + 1 time.sleep(sint) if out: out.close() if __name__ == '__main__': main() diff --git a/sys/contrib/openzfs/cmd/zdb/zdb.c b/sys/contrib/openzfs/cmd/zdb/zdb.c index adaa5cd10961..134c258a1e32 100644 --- a/sys/contrib/openzfs/cmd/zdb/zdb.c +++ b/sys/contrib/openzfs/cmd/zdb/zdb.c @@ -1,9984 +1,9984 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2019 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. * Copyright (c) 2015, 2017, Intel Corporation. * Copyright (c) 2020 Datto Inc. * Copyright (c) 2020, The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome * Copyright (c) 2023, 2024, Klara Inc. 
* Copyright (c) 2023, Rob Norris */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zdb.h" extern int reference_tracking_enable; extern int zfs_recover; extern uint_t zfs_vdev_async_read_max_active; extern boolean_t spa_load_verify_dryrun; extern boolean_t spa_mode_readable_spacemaps; extern uint_t zfs_reconstruct_indirect_combinations_max; extern uint_t zfs_btree_verify_intensity; static const char cmdname[] = "zdb"; uint8_t dump_opt[256]; typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); static uint64_t *zopt_metaslab = NULL; static unsigned zopt_metaslab_args = 0; static zopt_object_range_t *zopt_object_ranges = NULL; static unsigned zopt_object_args = 0; static int flagbits[256]; static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; static zfs_range_tree_t *mos_refd_objs; static spa_t *spa; static objset_t *os; static boolean_t kernel_init_done; static boolean_t corruption_found = B_FALSE; static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, boolean_t); static void mos_obj_refd(uint64_t); static void mos_obj_refd_multiple(uint64_t); static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx); static void zdb_print_blkptr(const blkptr_t *bp, int flags); static void zdb_exit(int reason); typedef struct sublivelist_verify_block_refcnt { /* block pointer entry in livelist being verified */ blkptr_t svbr_blk; /* * Refcount gets incremented to 1 when we encounter the first * FREE entry for the svfbr block pointer and a node for it * is created in our ZDB verification/tracking metadata. * * As we encounter more FREE entries we increment this counter * and similarly decrement it whenever we find the respective * ALLOC entries for this block. * * When the refcount gets to 0 it means that all the FREE and * ALLOC entries of this block have paired up and we no longer * need to track it in our verification logic (e.g. the node * containing this struct in our verification data structure * should be freed). * * [refer to sublivelist_verify_blkptr() for the actual code] */ uint32_t svbr_refcnt; } sublivelist_verify_block_refcnt_t; static int sublivelist_block_refcnt_compare(const void *larg, const void *rarg) { const sublivelist_verify_block_refcnt_t *l = larg; const sublivelist_verify_block_refcnt_t *r = rarg; return (livelist_compare(&l->svbr_blk, &r->svbr_blk)); } static int sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx) { ASSERT0P(tx); struct sublivelist_verify *sv = arg; sublivelist_verify_block_refcnt_t current = { .svbr_blk = *bp, /* * Start with 1 in case this is the first free entry. * This field is not used for our B-Tree comparisons * anyway. 
*/ .svbr_refcnt = 1, }; zfs_btree_index_t where; sublivelist_verify_block_refcnt_t *pair = zfs_btree_find(&sv->sv_pair, ¤t, &where); if (free) { if (pair == NULL) { /* first free entry for this block pointer */ zfs_btree_add(&sv->sv_pair, ¤t); } else { pair->svbr_refcnt++; } } else { if (pair == NULL) { /* block that is currently marked as allocated */ for (int i = 0; i < SPA_DVAS_PER_BP; i++) { if (DVA_IS_EMPTY(&bp->blk_dva[i])) break; sublivelist_verify_block_t svb = { .svb_dva = bp->blk_dva[i], .svb_allocated_txg = BP_GET_BIRTH(bp) }; if (zfs_btree_find(&sv->sv_leftover, &svb, &where) == NULL) { zfs_btree_add_idx(&sv->sv_leftover, &svb, &where); } } } else { /* alloc matches a free entry */ pair->svbr_refcnt--; if (pair->svbr_refcnt == 0) { /* all allocs and frees have been matched */ zfs_btree_remove_idx(&sv->sv_pair, &where); } } } return (0); } static int sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) { int err; struct sublivelist_verify *sv = args; zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL, sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, sv, NULL); sublivelist_verify_block_refcnt_t *e; zfs_btree_index_t *cookie = NULL; while ((e = zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &e->svbr_blk, B_TRUE); (void) printf("\tERROR: %d unmatched FREE(s): %s\n", e->svbr_refcnt, blkbuf); corruption_found = B_TRUE; } zfs_btree_destroy(&sv->sv_pair); return (err); } static int livelist_block_compare(const void *larg, const void *rarg) { const sublivelist_verify_block_t *l = larg; const sublivelist_verify_block_t *r = rarg; if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) return (-1); else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) return (+1); if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) return (-1); else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva)) return (+1); if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) return (-1); else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) return (+1); return (0); } /* * Check for errors in a livelist while tracking all unfreed ALLOCs in the * sublivelist_verify_t: sv->sv_leftover */ static void livelist_verify(dsl_deadlist_t *dl, void *arg) { sublivelist_verify_t *sv = arg; dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); } /* * Check for errors in the livelist entry and discard the intermediary * data structures */ static int sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) { (void) args; sublivelist_verify_t sv; zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); int err = sublivelist_verify_func(&sv, dle); zfs_btree_clear(&sv.sv_leftover); zfs_btree_destroy(&sv.sv_leftover); return (err); } typedef struct metaslab_verify { /* * Tree containing all the leftover ALLOCs from the livelists * that are part of this metaslab. */ zfs_btree_t mv_livelist_allocs; /* * Metaslab information. */ uint64_t mv_vdid; uint64_t mv_msid; uint64_t mv_start; uint64_t mv_end; /* * What's currently allocated for this metaslab. 
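The pairing logic in sublivelist_verify_blkptr() above can be summarized as: a FREE entry for a block bumps a per-block refcount, an ALLOC entry decrements it, and a block whose first record is an ALLOC is remembered as a leftover allocation. A simplified sketch in plain Python (dict- and set-based, whereas zdb uses B-trees keyed by block pointer):

def verify_sublivelist(entries):
    # entries: iterable of (block_id, is_free) in livelist order
    pending_frees = {}      # block id -> unmatched FREE count
    leftover_allocs = set() # blocks still marked allocated
    for blk, is_free in entries:
        if is_free:
            pending_frees[blk] = pending_frees.get(blk, 0) + 1
        elif blk in pending_frees:
            pending_frees[blk] -= 1
            if pending_frees[blk] == 0:
                del pending_frees[blk]   # all FREEs and ALLOCs paired up
        else:
            leftover_allocs.add(blk)
    # any unmatched FREEs left over indicate corruption
    return pending_frees, leftover_allocs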
*/ zfs_range_tree_t *mv_allocated; } metaslab_verify_t; typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg); typedef struct unflushed_iter_cb_arg { spa_t *uic_spa; uint64_t uic_txg; void *uic_arg; zdb_log_sm_cb_t uic_cb; } unflushed_iter_cb_arg_t; static int iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) { unflushed_iter_cb_arg_t *uic = arg; return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); } static void iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); unflushed_iter_cb_arg_t uic = { .uic_spa = spa, .uic_txg = sls->sls_txg, .uic_arg = arg, .uic_cb = cb }; VERIFY0(space_map_iterate(sm, space_map_length(sm), iterate_through_spacemap_logs_cb, &uic)); space_map_close(sm); } spa_config_exit(spa, SCL_CONFIG, FTAG); } static void verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, uint64_t offset, uint64_t size) { sublivelist_verify_block_t svb = {{{0}}}; DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); DVA_SET_OFFSET(&svb.svb_dva, offset); DVA_SET_ASIZE(&svb.svb_dva, size); zfs_btree_index_t where; uint64_t end_offset = offset + size; /* * Look for an exact match for spacemap entry in the livelist entries. * Then, look for other livelist entries that fall within the range * of the spacemap entry as it may have been condensed */ sublivelist_verify_block_t *found = zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); if (found == NULL) { found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); } for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && DVA_GET_OFFSET(&found->svb_dva) < end_offset; found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { if (found->svb_allocated_txg <= txg) { (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " "from TXG %llx FREED at TXG %llx\n", (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), (u_longlong_t)found->svb_allocated_txg, (u_longlong_t)txg); corruption_found = B_TRUE; } } } static int metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) { metaslab_verify_t *mv = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint64_t txg = sme->sme_txg; if (sme->sme_type == SM_ALLOC) { if (zfs_range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE ALLOC: " "%llu [%llx:%llx] " "%llu:%llu LOG_SM\n", (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); corruption_found = B_TRUE; } else { zfs_range_tree_add(mv->mv_allocated, offset, size); } } else { if (!zfs_range_tree_contains(mv->mv_allocated, offset, size)) { (void) printf("ERROR: DOUBLE FREE: " "%llu [%llx:%llx] " "%llu:%llu LOG_SM\n", (u_longlong_t)txg, (u_longlong_t)offset, (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, (u_longlong_t)mv->mv_msid); corruption_found = B_TRUE; } else { zfs_range_tree_remove(mv->mv_allocated, offset, size); } } if (sme->sme_type != SM_ALLOC) { /* * If something is freed in the spacemap, verify that * it is not listed as allocated in the livelist. 
*/ verify_livelist_allocs(mv, txg, offset, size); } return (0); } static int spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { metaslab_verify_t *mv = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) return (0); if (vdev_id != mv->mv_vdid) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; if (ms->ms_id != mv->mv_msid) return (0); if (txg < metaslab_unflushed_txg(ms)) return (0); ASSERT3U(txg, ==, sme->sme_txg); return (metaslab_spacemap_validation_cb(sme, mv)); } static void spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) { iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); } static void spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) { if (sm == NULL) return; VERIFY0(space_map_iterate(sm, space_map_length(sm), metaslab_spacemap_validation_cb, mv)); } static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); /* * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if * they are part of that metaslab (mv_msid). */ static void mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) { zfs_btree_index_t where; sublivelist_verify_block_t *svb; ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); for (svb = zfs_btree_first(&sv->sv_leftover, &where); svb != NULL; svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) continue; if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && (DVA_GET_OFFSET(&svb->svb_dva) + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { (void) printf("ERROR: Found block that crosses " "metaslab boundary: <%llu:%llx:%llx>\n", (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); corruption_found = B_TRUE; continue; } if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) continue; if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) continue; if ((DVA_GET_OFFSET(&svb->svb_dva) + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { (void) printf("ERROR: Found block that crosses " "metaslab boundary: <%llu:%llx:%llx>\n", (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); corruption_found = B_TRUE; continue; } zfs_btree_add(&mv->mv_livelist_allocs, svb); } for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); svb != NULL; svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { zfs_btree_remove(&sv->sv_leftover, svb); } } /* * [Livelist Check] * Iterate through all the sublivelists and: * - report leftover frees (**) * - record leftover ALLOCs together with their TXG [see Cross Check] * * (**) Note: Double ALLOCs are valid in datasets that have dedup * enabled. Similarly double FREEs are allowed as well but * only if they pair up with a corresponding ALLOC entry once * we our done with our sublivelist iteration. * * [Spacemap Check] * for each metaslab: * - iterate over spacemap and then the metaslab's entries in the * spacemap log, then report any double FREEs and ALLOCs (do not * blow up). * * [Cross Check] * After finishing the Livelist Check phase and while being in the * Spacemap Check phase, we find all the recorded leftover ALLOCs * of the livelist check that are part of the metaslab that we are * currently looking at in the Spacemap Check. 
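For the Spacemap Check described above, each metaslab's space map and log entries are replayed against a tree of currently-allocated ranges so that double ALLOCs and double FREEs can be reported without aborting. A much-simplified sketch at whole-entry granularity (replay_spacemap() is illustrative; zdb's range tree also handles partial overlaps):

def replay_spacemap(entries):
    # entries: iterable of (txg, kind, offset, size), kind in {"ALLOC", "FREE"}
    allocated = set()   # (offset, size) ranges currently allocated
    errors = []
    for txg, kind, offset, size in entries:
        if kind == "ALLOC":
            if (offset, size) in allocated:
                errors.append(("DOUBLE ALLOC", txg, offset, size))
            else:
                allocated.add((offset, size))
        else:
            if (offset, size) not in allocated:
                errors.append(("DOUBLE FREE", txg, offset, size))
            else:
                allocated.remove((offset, size))
    return errors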
We report any entries * that are marked as ALLOCs in the livelists but have been actually * freed (and potentially allocated again) after their TXG stamp in * the spacemaps. Also report any ALLOCs from the livelists that * belong to indirect vdevs (e.g. their vdev completed removal). * * Note that this will miss Log Spacemap entries that cancelled each other * out before being flushed to the metaslab, so we are not guaranteed * to match all erroneous ALLOCs. */ static void livelist_metaslab_validate(spa_t *spa) { (void) printf("Verifying deleted livelist entries\n"); sublivelist_verify_t sv; zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); iterate_deleted_livelists(spa, livelist_verify, &sv); (void) printf("Verifying metaslab entries\n"); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (!vdev_is_concrete(vd)) continue; for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { metaslab_t *m = vd->vdev_ms[mid]; (void) fprintf(stderr, "\rverifying concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)mid, (longlong_t)vd->vdev_ms_count); uint64_t shift, start; zfs_range_seg_type_t type = metaslab_calculate_range_tree_type(vd, m, &start, &shift); metaslab_verify_t mv; mv.mv_allocated = zfs_range_tree_create_flags( NULL, type, NULL, start, shift, 0, "livelist_metaslab_validate:mv_allocated"); mv.mv_vdid = vd->vdev_id; mv.mv_msid = m->ms_id; mv.mv_start = m->ms_start; mv.mv_end = m->ms_start + m->ms_size; zfs_btree_create(&mv.mv_livelist_allocs, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); mv_populate_livelist_allocs(&mv, &sv); spacemap_check_ms_sm(m->ms_sm, &mv); spacemap_check_sm_log(spa, &mv); zfs_range_tree_vacate(mv.mv_allocated, NULL, NULL); zfs_range_tree_destroy(mv.mv_allocated); zfs_btree_clear(&mv.mv_livelist_allocs); zfs_btree_destroy(&mv.mv_livelist_allocs); } } (void) fprintf(stderr, "\n"); /* * If there are any segments in the leftover tree after we walked * through all the metaslabs in the concrete vdevs then this means * that we have segments in the livelists that belong to indirect * vdevs and are marked as allocated. */ if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { zfs_btree_destroy(&sv.sv_leftover); return; } (void) printf("ERROR: Found livelist blocks marked as allocated " "for indirect vdevs:\n"); corruption_found = B_TRUE; zfs_btree_index_t *where = NULL; sublivelist_verify_block_t *svb; while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != NULL) { int vdev_id = DVA_GET_VDEV(&svb->svb_dva); ASSERT3U(vdev_id, <, rvd->vdev_children); vdev_t *vd = rvd->vdev_child[vdev_id]; ASSERT(!vdev_is_concrete(vd)); (void) printf("<%d:%llx:%llx> TXG %llx\n", vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), (u_longlong_t)svb->svb_allocated_txg); } (void) printf("\n"); zfs_btree_destroy(&sv.sv_leftover); } /* * These libumem hooks provide a reasonable set of defaults for the allocator's * debugging facilities. */ const char * _umem_debug_init(void) { return ("default,verbose"); /* $UMEM_DEBUG setting */ } const char * _umem_logging_init(void) { return ("fail,contents"); /* $UMEM_LOGGING setting */ } static void usage(void) { (void) fprintf(stderr, "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p ...]] " "[-I ]\n" "\t\t[-o =]... 
[-t ] [-U ] [-x ]\n" "\t\t[-K ]\n" "\t\t[[/] [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] [-K ]\n" "\t\t[[/] [ ...]\n" "\t%s -B [-e [-V] [-p ...]] [-I ]\n" "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[-K ] / []\n" "\t%s [-v] \n" "\t%s -C [-A] [-U ] []\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" "\t%s -O [-K ] \n" "\t%s -r [-K ] \n" "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" "\t\t ::[:]\n" "\t%s -E [-A] word0:word1:...:word15\n" "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); (void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n"); (void) fprintf(stderr, " If object numbers or object number " "ranges are specified, only those\n" " objects or ranges are dumped.\n\n"); (void) fprintf(stderr, " Object ranges take the form :[:]\n" " start Starting object number\n" " end Ending object number, or -1 for no upper bound\n" " flags Optional flags to select object types:\n" " A All objects (this is the default)\n" " d ZFS directories\n" " f ZFS files \n" " m SPA space maps\n" " z ZAPs\n" " - Negate effect of next flag\n\n"); (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b --block-stats " "block statistics\n"); (void) fprintf(stderr, " -B --backup " "backup stream\n"); (void) fprintf(stderr, " -c --checksum " "checksum all metadata (twice for all data) blocks\n"); (void) fprintf(stderr, " -C --config " "config (or cachefile if alone)\n"); (void) fprintf(stderr, " -d --datasets " "dataset(s)\n"); (void) fprintf(stderr, " -D --dedup-stats " "dedup statistics\n"); (void) fprintf(stderr, " -E --embedded-block-pointer=INTEGER\n" " decode and display block " "from an embedded block pointer\n"); (void) fprintf(stderr, " -h --history " "pool history\n"); (void) fprintf(stderr, " -i --intent-logs " "intent logs\n"); (void) fprintf(stderr, " -l --label " "read label contents\n"); (void) fprintf(stderr, " -k --checkpointed-state " "examine the checkpointed state of the pool\n"); (void) fprintf(stderr, " -L --disable-leak-tracking " "disable leak tracking (do not load spacemaps)\n"); (void) fprintf(stderr, " -m --metaslabs " "metaslabs\n"); (void) fprintf(stderr, " -M --metaslab-groups " "metaslab groups\n"); (void) fprintf(stderr, " -O --object-lookups " "perform object lookups by path\n"); (void) fprintf(stderr, " -r --copy-object " "copy an object by path to file\n"); (void) fprintf(stderr, " -R --read-block " "read and display block from a device\n"); (void) fprintf(stderr, " -s --io-stats " "report stats on zdb's I/O\n"); (void) fprintf(stderr, " -S --simulate-dedup " "simulate dedup to measure effect\n"); (void) fprintf(stderr, " -v --verbose " "verbose (applies to all others)\n"); (void) fprintf(stderr, " -y --livelist " "perform livelist and metaslab validation on any livelists being " "deleted\n\n"); (void) fprintf(stderr, " Below options are intended for use " "with other options:\n"); (void) fprintf(stderr, " -A --ignore-assertions " "ignore assertions (-A), enable panic recovery (-AA) or both " "(-AAA)\n"); (void) fprintf(stderr, " -e --exported " "pool is exported/destroyed/has altroot/not in a cachefile\n"); (void) fprintf(stderr, " -F --automatic-rewind " "attempt automatic rewind within safe range of transaction " "groups\n"); (void) fprintf(stderr, " -G 
--dump-debug-msg " "dump zfs_dbgmsg buffer before exiting\n"); (void) fprintf(stderr, " -I --inflight=INTEGER " "specify the maximum number of checksumming I/Os " "[default is 200]\n"); (void) fprintf(stderr, " -K --key=KEY " "decryption key for encrypted dataset\n"); (void) fprintf(stderr, " -o --option=\"NAME=VALUE\" " "set the named tunable to the given value\n"); (void) fprintf(stderr, " -p --path==PATH " "use one or more with -e to specify path to vdev dir\n"); (void) fprintf(stderr, " -P --parseable " "print numbers in parseable form\n"); (void) fprintf(stderr, " -q --skip-label " "don't print label contents\n"); (void) fprintf(stderr, " -t --txg=INTEGER " "highest txg to use when searching for uberblocks\n"); (void) fprintf(stderr, " -T --brt-stats " "BRT statistics\n"); (void) fprintf(stderr, " -u --uberblock " "uberblock\n"); (void) fprintf(stderr, " -U --cachefile=PATH " "use alternate cachefile\n"); (void) fprintf(stderr, " -V --verbatim " "do verbatim import\n"); (void) fprintf(stderr, " -x --dump-blocks=PATH " "dump all read blocks into specified directory\n"); (void) fprintf(stderr, " -X --extreme-rewind " "attempt extreme rewind (does not work with dataset)\n"); (void) fprintf(stderr, " -Y --all-reconstruction " "attempt all reconstruction combinations for split blocks\n"); (void) fprintf(stderr, " -Z --zstd-headers " "show ZSTD headers \n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n"); (void) fprintf(stderr, "Default is to dump everything non-verbosely\n"); zdb_exit(2); } static void dump_debug_buffer(void) { ssize_t ret __attribute__((unused)); if (!dump_opt['G']) return; /* * We use write() instead of printf() so that this function * is safe to call from a signal handler. */ ret = write(STDERR_FILENO, "\n", 1); zfs_dbgmsg_print(STDERR_FILENO, "zdb"); } static void sig_handler(int signo) { struct sigaction action; libspl_backtrace(STDERR_FILENO); dump_debug_buffer(); /* * Restore default action and re-raise signal so SIGSEGV and * SIGABRT can trigger a core dump. */ action.sa_handler = SIG_DFL; sigemptyset(&action.sa_mask); action.sa_flags = 0; (void) sigaction(signo, &action, NULL); raise(signo); } /* * Called for usage errors that are discovered after a call to spa_open(), * dmu_bonus_hold(), or pool_match(). abort() is called for other errors. */ static void fatal(const char *fmt, ...) 
{ va_list ap; va_start(ap, fmt); (void) fprintf(stderr, "%s: ", cmdname); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fprintf(stderr, "\n"); dump_debug_buffer(); zdb_exit(1); } static void dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size) { (void) size; nvlist_t *nv; size_t nvsize = *(uint64_t *)data; char *packed = umem_alloc(nvsize, UMEM_NOFAIL); VERIFY0(dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH)); VERIFY0(nvlist_unpack(packed, nvsize, &nv, 0)); umem_free(packed, nvsize); dump_nvlist(nv, 8); nvlist_free(nv); } static void dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) size; spa_history_phys_t *shp = data; if (shp == NULL) return; (void) printf("\t\tpool_create_len = %llu\n", (u_longlong_t)shp->sh_pool_create_len); (void) printf("\t\tphys_max_off = %llu\n", (u_longlong_t)shp->sh_phys_max_off); (void) printf("\t\tbof = %llu\n", (u_longlong_t)shp->sh_bof); (void) printf("\t\teof = %llu\n", (u_longlong_t)shp->sh_eof); (void) printf("\t\trecords_lost = %llu\n", (u_longlong_t)shp->sh_records_lost); } static void zdb_nicenum(uint64_t num, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)num); else nicenum(num, buf, buflen); } static void zdb_nicebytes(uint64_t bytes, char *buf, size_t buflen) { if (dump_opt['P']) (void) snprintf(buf, buflen, "%llu", (longlong_t)bytes); else zfs_nicebytes(bytes, buf, buflen); } static const char histo_stars[] = "****************************************"; static const uint64_t histo_width = sizeof (histo_stars) - 1; static void dump_histogram(const uint64_t *histo, int size, int offset) { int i; int minidx = size - 1; int maxidx = 0; uint64_t max = 0; for (i = 0; i < size; i++) { if (histo[i] == 0) continue; if (histo[i] > max) max = histo[i]; if (i > maxidx) maxidx = i; if (i < minidx) minidx = i; } if (max < histo_width) max = histo_width; for (i = minidx; i <= maxidx; i++) { (void) printf("\t\t\t%3u: %6llu %s\n", i + offset, (u_longlong_t)histo[i], &histo_stars[(max - histo[i]) * histo_width / max]); } } static void dump_zap_stats(objset_t *os, uint64_t object) { int error; zap_stats_t zs; error = zap_get_stats(os, object, &zs); if (error) return; if (zs.zs_ptrtbl_len == 0) { ASSERT(zs.zs_num_blocks == 1); (void) printf("\tmicrozap: %llu bytes, %llu entries\n", (u_longlong_t)zs.zs_blocksize, (u_longlong_t)zs.zs_num_entries); return; } (void) printf("\tFat ZAP stats:\n"); (void) printf("\t\tPointer table:\n"); (void) printf("\t\t\t%llu elements\n", (u_longlong_t)zs.zs_ptrtbl_len); (void) printf("\t\t\tzt_blk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_blk); (void) printf("\t\t\tzt_numblks: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_numblks); (void) printf("\t\t\tzt_shift: %llu\n", (u_longlong_t)zs.zs_ptrtbl_zt_shift); (void) printf("\t\t\tzt_blks_copied: %llu\n", (u_longlong_t)zs.zs_ptrtbl_blks_copied); (void) printf("\t\t\tzt_nextblk: %llu\n", (u_longlong_t)zs.zs_ptrtbl_nextblk); (void) printf("\t\tZAP entries: %llu\n", (u_longlong_t)zs.zs_num_entries); (void) printf("\t\tLeaf blocks: %llu\n", (u_longlong_t)zs.zs_num_leafs); (void) printf("\t\tTotal blocks: %llu\n", (u_longlong_t)zs.zs_num_blocks); (void) printf("\t\tzap_block_type: 0x%llx\n", (u_longlong_t)zs.zs_block_type); (void) printf("\t\tzap_magic: 0x%llx\n", (u_longlong_t)zs.zs_magic); (void) printf("\t\tzap_salt: 0x%llx\n", (u_longlong_t)zs.zs_salt); (void) printf("\t\tLeafs with 2^n pointers:\n"); dump_histogram(zs.zs_leafs_with_2n_pointers, 
ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks with n*5 entries:\n"); dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBlocks n/10 full:\n"); dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tEntries with n chunks:\n"); dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); (void) printf("\t\tBuckets with n entries:\n"); dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); } static void dump_none(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; (void) printf("\tUNKNOWN OBJECT TYPE\n"); } static void dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) { uint64_t *arr; uint64_t oursize; if (dump_opt['d'] < 6) return; if (data == NULL) { dmu_object_info_t doi; VERIFY0(dmu_object_info(os, object, &doi)); size = doi.doi_max_offset; /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. */ oursize = MIN(size, 1 << 20); arr = kmem_alloc(oursize, KM_SLEEP); int err = dmu_read(os, object, 0, oursize, arr, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(arr, oursize); return; } } else { /* * Even though the allocation is already done in this code path, * we still cap the size to prevent excessive printing. */ oursize = MIN(size, 1 << 20); arr = data; } if (size == 0) { if (data == NULL) kmem_free(arr, oursize); (void) printf("\t\t[]\n"); return; } (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { if (i % 4 != 0) (void) printf(", %0llx", (u_longlong_t)arr[i]); else (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); } if (oursize != size) (void) printf(", ... 
"); (void) printf("]\n"); if (data == NULL) kmem_free(arr, oursize); } static void dump_zap(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t *attrp = zap_attribute_long_alloc(); void *prop; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, attrp) == 0; zap_cursor_advance(&zc)) { boolean_t key64 = !!(zap_getflags(zc.zc_zap) & ZAP_FLAG_UINT64_KEY); if (key64) (void) printf("\t\t0x%010" PRIu64 "x = ", *(uint64_t *)attrp->za_name); else (void) printf("\t\t%s = ", attrp->za_name); if (attrp->za_num_integers == 0) { (void) printf("\n"); continue; } prop = umem_zalloc(attrp->za_num_integers * attrp->za_integer_length, UMEM_NOFAIL); if (key64) (void) zap_lookup_uint64(os, object, (const uint64_t *)attrp->za_name, 1, attrp->za_integer_length, attrp->za_num_integers, prop); else (void) zap_lookup(os, object, attrp->za_name, attrp->za_integer_length, attrp->za_num_integers, prop); if (attrp->za_integer_length == 1 && !key64) { if (strcmp(attrp->za_name, DSL_CRYPTO_KEY_MASTER_KEY) == 0 || strcmp(attrp->za_name, DSL_CRYPTO_KEY_HMAC_KEY) == 0 || strcmp(attrp->za_name, DSL_CRYPTO_KEY_IV) == 0 || strcmp(attrp->za_name, DSL_CRYPTO_KEY_MAC) == 0 || strcmp(attrp->za_name, DMU_POOL_CHECKSUM_SALT) == 0) { uint8_t *u8 = prop; for (i = 0; i < attrp->za_num_integers; i++) { (void) printf("%02x", u8[i]); } } else { (void) printf("%s", (char *)prop); } } else { for (i = 0; i < attrp->za_num_integers; i++) { switch (attrp->za_integer_length) { case 1: (void) printf("%u ", ((uint8_t *)prop)[i]); break; case 2: (void) printf("%u ", ((uint16_t *)prop)[i]); break; case 4: (void) printf("%u ", ((uint32_t *)prop)[i]); break; case 8: (void) printf("%lld ", (u_longlong_t)((int64_t *)prop)[i]); break; } } } (void) printf("\n"); umem_free(prop, attrp->za_num_integers * attrp->za_integer_length); } zap_cursor_fini(&zc); zap_attribute_free(attrp); } static void dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) { bpobj_phys_t *bpop = data; uint64_t i; char bytes[32], comp[32], uncomp[32]; /* make sure the output won't get truncated */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); if (bpop == NULL) return; zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); (void) printf("\t\tnum_blkptrs = %llu\n", (u_longlong_t)bpop->bpo_num_blkptrs); (void) printf("\t\tbytes = %s\n", bytes); if (size >= BPOBJ_SIZE_V1) { (void) printf("\t\tcomp = %s\n", comp); (void) printf("\t\tuncomp = %s\n", uncomp); } if (size >= BPOBJ_SIZE_V2) { (void) printf("\t\tsubobjs = %llu\n", (u_longlong_t)bpop->bpo_subobjs); (void) printf("\t\tnum_subobjs = %llu\n", (u_longlong_t)bpop->bpo_num_subobjs); } if (size >= sizeof (*bpop)) { (void) printf("\t\tnum_freed = %llu\n", (u_longlong_t)bpop->bpo_num_freed); } if (dump_opt['d'] < 5) return; for (i = 0; i < bpop->bpo_num_blkptrs; i++) { char blkbuf[BP_SPRINTF_LEN]; blkptr_t bp; int err = dmu_read(os, object, i * sizeof (bp), sizeof (bp), &bp, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); break; } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, BP_GET_FREE(&bp)); (void) printf("\t%s\n", blkbuf); } } static void dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, 
size_t size) { (void) data, (void) size; dmu_object_info_t doi; int64_t i; VERIFY0(dmu_object_info(os, object, &doi)); uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(subobjs, doi.doi_max_offset); return; } int64_t last_nonzero = -1; for (i = 0; i < doi.doi_max_offset / 8; i++) { if (subobjs[i] != 0) last_nonzero = i; } for (i = 0; i <= last_nonzero; i++) { (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]); } kmem_free(subobjs, doi.doi_max_offset); } static void dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; dump_zap_stats(os, object); /* contents are printed elsewhere, properly decoded */ } static void dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t *attrp = zap_attribute_alloc(); dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, attrp) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = ", attrp->za_name); if (attrp->za_num_integers == 0) { (void) printf("\n"); continue; } (void) printf(" %llx : [%d:%d:%d]\n", (u_longlong_t)attrp->za_first_integer, (int)ATTR_LENGTH(attrp->za_first_integer), (int)ATTR_BSWAP(attrp->za_first_integer), (int)ATTR_NUM(attrp->za_first_integer)); } zap_cursor_fini(&zc); zap_attribute_free(attrp); } static void dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t *attrp = zap_attribute_alloc(); uint16_t *layout_attrs; unsigned i; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, attrp) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = [", attrp->za_name); if (attrp->za_num_integers == 0) { (void) printf("\n"); continue; } VERIFY(attrp->za_integer_length == 2); layout_attrs = umem_zalloc(attrp->za_num_integers * attrp->za_integer_length, UMEM_NOFAIL); VERIFY(zap_lookup(os, object, attrp->za_name, attrp->za_integer_length, attrp->za_num_integers, layout_attrs) == 0); for (i = 0; i != attrp->za_num_integers; i++) (void) printf(" %d ", (int)layout_attrs[i]); (void) printf("]\n"); umem_free(layout_attrs, attrp->za_num_integers * attrp->za_integer_length); } zap_cursor_fini(&zc); zap_attribute_free(attrp); } static void dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; zap_cursor_t zc; zap_attribute_t *attrp = zap_attribute_long_alloc(); const char *typenames[] = { /* 0 */ "not specified", /* 1 */ "FIFO", /* 2 */ "Character Device", /* 3 */ "3 (invalid)", /* 4 */ "Directory", /* 5 */ "5 (invalid)", /* 6 */ "Block Device", /* 7 */ "7 (invalid)", /* 8 */ "Regular File", /* 9 */ "9 (invalid)", /* 10 */ "Symbolic Link", /* 11 */ "11 (invalid)", /* 12 */ "Socket", /* 13 */ "Door", /* 14 */ "Event Port", /* 15 */ "15 (invalid)", }; dump_zap_stats(os, object); (void) printf("\n"); for (zap_cursor_init(&zc, os, object); zap_cursor_retrieve(&zc, attrp) == 0; zap_cursor_advance(&zc)) { (void) printf("\t\t%s = %lld (type: %s)\n", attrp->za_name, ZFS_DIRENT_OBJ(attrp->za_first_integer), typenames[ZFS_DIRENT_TYPE(attrp->za_first_integer)]); } zap_cursor_fini(&zc); zap_attribute_free(attrp); } static int get_dtl_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_ops->vdev_op_leaf) { space_map_t *sm = vd->vdev_dtl_sm; if (sm != NULL 
&& sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) return (1); return (0); } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_dtl_refcount(vd->vdev_child[c]); return (refcount); } static int get_metaslab_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd) { for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; if (sm != NULL && sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) refcount++; } } for (unsigned c = 0; c < vd->vdev_children; c++) refcount += get_metaslab_refcount(vd->vdev_child[c]); return (refcount); } static int get_obsolete_refcount(vdev_t *vd) { uint64_t obsolete_sm_object; int refcount = 0; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (vd->vdev_top == vd && obsolete_sm_object != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, obsolete_sm_object, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { refcount++; } } else { ASSERT0P(vd->vdev_obsolete_sm); ASSERT0(obsolete_sm_object); } for (unsigned c = 0; c < vd->vdev_children; c++) { refcount += get_obsolete_refcount(vd->vdev_child[c]); } return (refcount); } static int get_prev_obsolete_spacemap_refcount(spa_t *spa) { uint64_t prev_obj = spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; if (prev_obj != 0) { dmu_object_info_t doi; VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { return (1); } } return (0); } static int get_checkpoint_refcount(vdev_t *vd) { int refcount = 0; if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && zap_contains(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) refcount++; for (uint64_t c = 0; c < vd->vdev_children; c++) refcount += get_checkpoint_refcount(vd->vdev_child[c]); return (refcount); } static int get_log_spacemap_refcount(spa_t *spa) { return (avl_numnodes(&spa->spa_sm_logs_by_txg)); } static int verify_spacemap_refcounts(spa_t *spa) { uint64_t expected_refcount = 0; uint64_t actual_refcount; (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], &expected_refcount); actual_refcount = get_dtl_refcount(spa->spa_root_vdev); actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); actual_refcount += get_prev_obsolete_spacemap_refcount(spa); actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); actual_refcount += get_log_spacemap_refcount(spa); if (expected_refcount != actual_refcount) { (void) printf("space map refcount mismatch: expected %lld != " "actual %lld\n", (longlong_t)expected_refcount, (longlong_t)actual_refcount); return (2); } return (0); } static void dump_spacemap(objset_t *os, space_map_t *sm) { const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; if (sm == NULL) return; (void) printf("space map object %llu:\n", (longlong_t)sm->sm_object); (void) printf(" smp_length = 0x%llx\n", (longlong_t)sm->sm_phys->smp_length); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); if (dump_opt['d'] < 6 && dump_opt['m'] < 4) return; /* * Print out the freelist entries in both encoded and decoded form. 
*/ uint8_t mapshift = sm->sm_shift; int64_t alloc = 0; uint64_t word, entry_id = 0; for (uint64_t offset = 0; offset < space_map_length(sm); offset += sizeof (word)) { VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { uint64_t de_txg = SM_DEBUG_TXG_DECODE(word); uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word); if (de_txg == 0) { (void) printf( "\t [%6llu] PADDING\n", (u_longlong_t)entry_id); } else { (void) printf( "\t [%6llu] %s: txg %llu pass %llu\n", (u_longlong_t)entry_id, ddata[SM_DEBUG_ACTION_DECODE(word)], (u_longlong_t)de_txg, (u_longlong_t)de_sync_pass); } entry_id++; continue; } char entry_type; uint64_t entry_off, entry_run, entry_vdev; if (sm_entry_is_single_word(word)) { entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM_OFFSET_DECODE(word) << mapshift) + sm->sm_start; entry_run = SM_RUN_DECODE(word) << mapshift; (void) printf("\t [%6llu] %c " "range: %012llx-%012llx size: %08llx\n", (u_longlong_t)entry_id, entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run - 1), (u_longlong_t)entry_run); } else { /* it is a two-word entry so we read another word */ ASSERT(sm_entry_is_double_word(word)); uint64_t extra_word; offset += sizeof (extra_word); ASSERT3U(offset, <, space_map_length(sm)); VERIFY0(dmu_read(os, space_map_object(sm), offset, sizeof (extra_word), &extra_word, DMU_READ_PREFETCH)); entry_run = SM2_RUN_DECODE(word) << mapshift; entry_vdev = SM2_VDEV_DECODE(word); entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 'A' : 'F'; entry_off = (SM2_OFFSET_DECODE(extra_word) << mapshift) + sm->sm_start; if (zopt_metaslab_args == 0 || zopt_metaslab[0] == entry_vdev) { (void) printf("\t [%6llu] %c " "range: %012llx-%012llx size: %08llx " "vdev: %llu\n", (u_longlong_t)entry_id, entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run - 1), (u_longlong_t)entry_run, (u_longlong_t)entry_vdev); } } if (entry_type == 'A') alloc += entry_run; else alloc -= entry_run; entry_id++; } if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); } } static void dump_metaslab_stats(metaslab_t *msp) { char maxbuf[32]; zfs_range_tree_t *rt = msp->ms_allocatable; zfs_btree_t *t = &msp->ms_allocatable_by_size; int free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size; /* max sure nicenum has enough space */ _Static_assert(sizeof (maxbuf) >= NN_NUMBUF_SZ, "maxbuf truncated"); zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, "freepct", free_pct); (void) printf("\tIn-memory histogram:\n"); dump_histogram(rt->rt_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } static void dump_metaslab(metaslab_t *msp) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; space_map_t *sm = msp->ms_sm; char freebuf[32]; zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, sizeof (freebuf)); (void) printf( "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, (u_longlong_t)space_map_object(sm), freebuf); if (dump_opt['m'] > 2 && !dump_opt['L']) { mutex_enter(&msp->ms_lock); VERIFY0(metaslab_load(msp)); zfs_range_tree_stat_verify(msp->ms_allocatable); dump_metaslab_stats(msp); metaslab_unload(msp); mutex_exit(&msp->ms_lock); } if 
(dump_opt['m'] > 1 && sm != NULL && spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { /* * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). */ (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } if (vd->vdev_ops == &vdev_draid_ops) ASSERT3U(msp->ms_size, <=, 1ULL << vd->vdev_ms_shift); else ASSERT3U(msp->ms_size, ==, 1ULL << vd->vdev_ms_shift); dump_spacemap(spa->spa_meta_objset, msp->ms_sm); if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", (u_longlong_t)metaslab_unflushed_txg(msp)); } } static void print_vdev_metaslab_header(vdev_t *vd) { vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *bias_str = ""; if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { bias_str = VDEV_ALLOC_BIAS_LOG; } else if (alloc_bias == VDEV_BIAS_SPECIAL) { bias_str = VDEV_ALLOC_BIAS_SPECIAL; } else if (alloc_bias == VDEV_BIAS_DEDUP) { bias_str = VDEV_ALLOC_BIAS_DEDUP; } uint64_t ms_flush_data_obj = 0; if (vd->vdev_top_zap != 0) { int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &ms_flush_data_obj); if (error != ENOENT) { ASSERT0(error); } } (void) printf("\tvdev %10llu %s", (u_longlong_t)vd->vdev_id, bias_str); if (ms_flush_data_obj != 0) { (void) printf(" ms_unflushed_phys object %llu", (u_longlong_t)ms_flush_data_obj); } (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", "metaslabs", (u_longlong_t)vd->vdev_ms_count, "offset", "spacemap", "free"); (void) printf("\t%15s %19s %15s %12s\n", "---------------", "-------------------", "---------------", "------------"); } static void dump_metaslab_groups(spa_t *spa, boolean_t show_special) { vdev_t *rvd = spa->spa_root_vdev; metaslab_class_t *mc = spa_normal_class(spa); metaslab_class_t *smc = spa_special_class(spa); uint64_t fragmentation; metaslab_class_histogram_verify(mc); for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (mg == NULL || (mg->mg_class != mc && (!show_special || mg->mg_class != smc))) continue; metaslab_group_histogram_verify(mg); mg->mg_fragmentation = metaslab_group_fragmentation(mg); (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" "fragmentation", (u_longlong_t)tvd->vdev_id, (u_longlong_t)tvd->vdev_ms_count); if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { (void) printf("%3s\n", "-"); } else { (void) printf("%3llu%%\n", (u_longlong_t)mg->mg_fragmentation); } dump_histogram(mg->mg_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } (void) printf("\tpool %s\tfragmentation", spa_name(spa)); fragmentation = metaslab_class_fragmentation(mc); if (fragmentation == ZFS_FRAG_INVALID) (void) printf("\t%3s\n", "-"); else (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); dump_histogram(mc->mc_histogram, ZFS_RANGE_TREE_HISTOGRAM_SIZE, 0); } static void print_vdev_indirect(vdev_t *vd) { vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; vdev_indirect_births_t *vib = vd->vdev_indirect_births; if (vim == NULL) { ASSERT0P(vib); return; } ASSERT3U(vdev_indirect_mapping_object(vim), ==, vic->vic_mapping_object); ASSERT3U(vdev_indirect_births_object(vib), ==, vic->vic_births_object); (void) printf("indirect births obj %llu:\n", 
(longlong_t)vic->vic_births_object); (void) printf(" vib_count = %llu\n", (longlong_t)vdev_indirect_births_count(vib)); for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { vdev_indirect_birth_entry_phys_t *cur_vibe = &vib->vib_entries[i]; (void) printf("\toffset %llx -> txg %llu\n", (longlong_t)cur_vibe->vibe_offset, (longlong_t)cur_vibe->vibe_phys_birth_txg); } (void) printf("\n"); (void) printf("indirect mapping obj %llu:\n", (longlong_t)vic->vic_mapping_object); (void) printf(" vim_max_offset = 0x%llx\n", (longlong_t)vdev_indirect_mapping_max_offset(vim)); (void) printf(" vim_bytes_mapped = 0x%llx\n", (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); (void) printf(" vim_count = %llu\n", (longlong_t)vdev_indirect_mapping_num_entries(vim)); if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) return; uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; (void) printf("\t<%llx:%llx:%llx> -> " "<%llx:%llx:%llx> (%x obsolete)\n", (longlong_t)vd->vdev_id, (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), counts[i]); } (void) printf("\n"); uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; (void) printf("obsolete space map object %llu:\n", (u_longlong_t)obsolete_sm_object); ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, obsolete_sm_object); dump_spacemap(mos, vd->vdev_obsolete_sm); (void) printf("\n"); } } static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; uint64_t m, c = 0, children = rvd->vdev_children; (void) printf("\nMetaslabs:\n"); if (zopt_metaslab_args > 0) { c = zopt_metaslab[0]; if (c >= children) (void) fatal("bad vdev id: %llu", (u_longlong_t)c); if (zopt_metaslab_args > 1) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); for (m = 1; m < zopt_metaslab_args; m++) { if (zopt_metaslab[m] < vd->vdev_ms_count) dump_metaslab( vd->vdev_ms[zopt_metaslab[m]]); else (void) fprintf(stderr, "bad metaslab " "number %llu\n", (u_longlong_t)zopt_metaslab[m]); } (void) printf("\n"); return; } children = c + 1; } for (; c < children; c++) { vd = rvd->vdev_child[c]; print_vdev_metaslab_header(vd); print_vdev_indirect(vd); for (m = 0; m < vd->vdev_ms_count; m++) dump_metaslab(vd->vdev_ms[m]); (void) printf("\n"); } } static void dump_log_spacemaps(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; (void) printf("\nLog Space Maps in Pool:\n"); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { space_map_t *sm = NULL; VERIFY0(space_map_open(&sm, spa_meta_objset(spa), sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); (void) printf("Log Spacemap object %llu txg %llu\n", (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); dump_spacemap(spa->spa_meta_objset, sm); space_map_close(sm); } (void) printf("\n"); } static void dump_ddt_entry(const ddt_t *ddt, const ddt_lightweight_entry_t *ddlwe, uint64_t index) { const ddt_key_t *ddk = &ddlwe->ddlwe_key; char blkbuf[BP_SPRINTF_LEN]; blkptr_t blk; int p; for (p = 0; p < DDT_NPHYS(ddt); p++) { const ddt_univ_phys_t *ddp = 
&ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); if (ddt_phys_birth(ddp, v) == 0) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); (void) printf("index %llx refcnt %llu phys %d %s\n", (u_longlong_t)index, (u_longlong_t)ddt_phys_refcnt(ddp, v), p, blkbuf); } } static void dump_dedup_ratio(const ddt_stat_t *dds) { double rL, rP, rD, D, dedup, compress, copies; if (dds->dds_blocks == 0) return; rL = (double)dds->dds_ref_lsize; rP = (double)dds->dds_ref_psize; rD = (double)dds->dds_ref_dsize; D = (double)dds->dds_dsize; dedup = rD / D; compress = rL / rP; copies = rD / rP; (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " "dedup * compress / copies = %.2f\n\n", dedup, compress, copies, dedup * compress / copies); } static void dump_ddt_log(ddt_t *ddt) { if (ddt->ddt_version != DDT_VERSION_FDT || !(ddt->ddt_flags & DDT_FLAG_LOG)) return; for (int n = 0; n < 2; n++) { ddt_log_t *ddl = &ddt->ddt_log[n]; char flagstr[64] = {0}; if (ddl->ddl_flags > 0) { flagstr[0] = ' '; int c = 1; if (ddl->ddl_flags & DDL_FLAG_FLUSHING) c += strlcpy(&flagstr[c], " FLUSHING", sizeof (flagstr) - c); if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) c += strlcpy(&flagstr[c], " CHECKPOINT", sizeof (flagstr) - c); if (ddl->ddl_flags & ~(DDL_FLAG_FLUSHING|DDL_FLAG_CHECKPOINT)) c += strlcpy(&flagstr[c], " UNKNOWN", sizeof (flagstr) - c); flagstr[1] = '['; flagstr[c] = ']'; } uint64_t count = avl_numnodes(&ddl->ddl_tree); printf(DMU_POOL_DDT_LOG ": flags=0x%02x%s; obj=%llu; " "len=%llu; txg=%llu; entries=%llu\n", zio_checksum_table[ddt->ddt_checksum].ci_name, n, ddl->ddl_flags, flagstr, (u_longlong_t)ddl->ddl_object, (u_longlong_t)ddl->ddl_length, (u_longlong_t)ddl->ddl_first_txg, (u_longlong_t)count); if (ddl->ddl_flags & DDL_FLAG_CHECKPOINT) { const ddt_key_t *ddk = &ddl->ddl_checkpoint; printf(" checkpoint: " "%016llx:%016llx:%016llx:%016llx:%016llx\n", (u_longlong_t)ddk->ddk_cksum.zc_word[0], (u_longlong_t)ddk->ddk_cksum.zc_word[1], (u_longlong_t)ddk->ddk_cksum.zc_word[2], (u_longlong_t)ddk->ddk_cksum.zc_word[3], (u_longlong_t)ddk->ddk_prop); } if (count == 0 || dump_opt['D'] < 4) continue; ddt_lightweight_entry_t ddlwe; uint64_t index = 0; for (ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); ddle; ddle = AVL_NEXT(&ddl->ddl_tree, ddle)) { DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); dump_ddt_entry(ddt, &ddlwe, index++); } } } static void dump_ddt_object(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; ddt_lightweight_entry_t ddlwe; uint64_t walk = 0; dmu_object_info_t doi; uint64_t count, dspace, mspace; int error; error = ddt_object_info(ddt, type, class, &doi); if (error == ENOENT) return; ASSERT0(error); error = ddt_object_count(ddt, type, class, &count); ASSERT0(error); if (count == 0) return; dspace = doi.doi_physical_blocks_512 << 9; mspace = doi.doi_fill_count * doi.doi_data_block_size; ddt_object_name(ddt, type, class, name); (void) printf("%s: dspace=%llu; mspace=%llu; entries=%llu\n", name, (u_longlong_t)dspace, (u_longlong_t)mspace, (u_longlong_t)count); if (dump_opt['D'] < 3) return; (void) printf("%s: object=%llu\n", name, (u_longlong_t)ddt->ddt_object[type][class]); zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); if (dump_opt['D'] < 4) return; if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) return; (void) printf("%s contents:\n\n", name); while ((error = ddt_object_walk(ddt, type, class, &walk, &ddlwe)) == 0) dump_ddt_entry(ddt, &ddlwe, walk); ASSERT3U(error, ==, ENOENT); 
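/*
 * The ratios printed by dump_dedup_ratio() above are plain quotients of the
 * aggregated ddt_stat_t counters. With illustrative totals (hypothetical
 * numbers, not from any real pool) dds_ref_lsize = 1000, dds_ref_psize = 500,
 * dds_ref_dsize = 600 and dds_dsize = 200:
 *
 *	dedup    = dds_ref_dsize / dds_dsize     = 600 / 200  = 3.00
 *	compress = dds_ref_lsize / dds_ref_psize = 1000 / 500 = 2.00
 *	copies   = dds_ref_dsize / dds_ref_psize = 600 / 500  = 1.20
 *	dedup * compress / copies = 3.00 * 2.00 / 1.20 = 5.00
 *
 * The combined figure reduces to dds_ref_lsize / dds_dsize, i.e. the overall
 * logical-to-allocated ratio for the deduplicated blocks.
 */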
(void) printf("\n"); } static void dump_ddt(ddt_t *ddt) { if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) return; char flagstr[64] = {0}; if (ddt->ddt_flags > 0) { flagstr[0] = ' '; int c = 1; if (ddt->ddt_flags & DDT_FLAG_FLAT) c += strlcpy(&flagstr[c], " FLAT", sizeof (flagstr) - c); if (ddt->ddt_flags & DDT_FLAG_LOG) c += strlcpy(&flagstr[c], " LOG", sizeof (flagstr) - c); if (ddt->ddt_flags & ~DDT_FLAG_MASK) c += strlcpy(&flagstr[c], " UNKNOWN", sizeof (flagstr) - c); flagstr[1] = '['; flagstr[c] = ']'; } printf("DDT-%s: version=%llu [%s]; flags=0x%02llx%s; rootobj=%llu\n", zio_checksum_table[ddt->ddt_checksum].ci_name, (u_longlong_t)ddt->ddt_version, (ddt->ddt_version == 0) ? "LEGACY" : (ddt->ddt_version == 1) ? "FDT" : "UNKNOWN", (u_longlong_t)ddt->ddt_flags, flagstr, (u_longlong_t)ddt->ddt_dir_object); for (ddt_type_t type = 0; type < DDT_TYPES; type++) for (ddt_class_t class = 0; class < DDT_CLASSES; class++) dump_ddt_object(ddt, type, class); dump_ddt_log(ddt); } static void dump_all_ddts(spa_t *spa) { ddt_histogram_t ddh_total = {{{0}}}; ddt_stat_t dds_total = {0}; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) dump_ddt(spa->spa_ddt[c]); ddt_get_dedup_stats(spa, &dds_total); if (dds_total.dds_blocks == 0) { (void) printf("All DDTs are empty\n"); return; } (void) printf("\n"); if (dump_opt['D'] > 1) { (void) printf("DDT histogram (aggregated over all DDTs):\n"); ddt_get_dedup_histogram(spa, &ddh_total); zpool_dump_ddt(&dds_total, &ddh_total); } dump_dedup_ratio(&dds_total); /* * Dump a histogram of unique class entry age */ if (dump_opt['D'] == 3 && getenv("ZDB_DDT_UNIQUE_AGE_HIST") != NULL) { ddt_age_histo_t histogram; (void) printf("DDT walk unique, building age histogram...\n"); ddt_prune_walk(spa, 0, &histogram); /* * print out histogram for unique entry class birth */ if (histogram.dah_entries > 0) { (void) printf("%5s %9s %4s\n", "age", "blocks", "amnt"); (void) printf("%5s %9s %4s\n", "-----", "---------", "----"); for (int i = 0; i < HIST_BINS; i++) { (void) printf("%5d %9d %4d%%\n", 1 << i, (int)histogram.dah_age_histo[i], (int)((histogram.dah_age_histo[i] * 100) / histogram.dah_entries)); } } } } static void dump_brt(spa_t *spa) { if (!spa_feature_is_enabled(spa, SPA_FEATURE_BLOCK_CLONING)) { printf("BRT: unsupported on this pool\n"); return; } if (!spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { printf("BRT: empty\n"); return; } char count[32], used[32], saved[32]; zdb_nicebytes(brt_get_used(spa), used, sizeof (used)); zdb_nicebytes(brt_get_saved(spa), saved, sizeof (saved)); uint64_t ratio = brt_get_ratio(spa); printf("BRT: used %s; saved %s; ratio %llu.%02llux\n", used, saved, (u_longlong_t)(ratio / 100), (u_longlong_t)(ratio % 100)); if (dump_opt['T'] < 2) return; for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; if (!brtvd->bv_initiated) { printf("BRT: vdev %" PRIu64 ": empty\n", vdevid); continue; } zdb_nicenum(brtvd->bv_totalcount, count, sizeof (count)); zdb_nicebytes(brtvd->bv_usedspace, used, sizeof (used)); zdb_nicebytes(brtvd->bv_savedspace, saved, sizeof (saved)); printf("BRT: vdev %" PRIu64 ": refcnt %s; used %s; saved %s\n", vdevid, count, used, saved); } if (dump_opt['T'] < 3) return; /* -TTT shows a per-vdev histograms; -TTTT shows all entries */ boolean_t do_histo = dump_opt['T'] == 3; char dva[64]; if (!do_histo) printf("\n%-16s %-10s\n", "DVA", "REFCNT"); for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = 
spa->spa_brt_vdevs[vdevid]; if (!brtvd->bv_initiated) continue; uint64_t counts[64] = {}; zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, spa->spa_meta_objset, brtvd->bv_mos_entries); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { uint64_t refcnt; VERIFY0(zap_lookup_uint64(spa->spa_meta_objset, brtvd->bv_mos_entries, (const uint64_t *)za->za_name, 1, za->za_integer_length, za->za_num_integers, &refcnt)); if (do_histo) counts[highbit64(refcnt)]++; else { uint64_t offset = *(const uint64_t *)za->za_name; snprintf(dva, sizeof (dva), "%" PRIu64 ":%llx", vdevid, (u_longlong_t)offset); printf("%-16s %-10llu\n", dva, (u_longlong_t)refcnt); } } zap_cursor_fini(&zc); zap_attribute_free(za); if (do_histo) { printf("\nBRT: vdev %" PRIu64 ": DVAs with 2^n refcnts:\n", vdevid); dump_histogram(counts, 64, 0); } } } static void dump_dtl_seg(void *arg, uint64_t start, uint64_t size) { char *prefix = arg; (void) printf("%s [%llu,%llu) length %llu\n", prefix, (u_longlong_t)start, (u_longlong_t)(start + size), (u_longlong_t)(size)); } static void dump_dtl(vdev_t *vd, int indent) { spa_t *spa = vd->vdev_spa; boolean_t required; const char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" }; char prefix[256]; spa_vdev_state_enter(spa, SCL_NONE); required = vdev_dtl_required(vd); (void) spa_vdev_state_exit(spa, NULL, 0); if (indent == 0) (void) printf("\nDirty time logs:\n\n"); (void) printf("\t%*s%s [%s]\n", indent, "", vd->vdev_path ? vd->vdev_path : vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), required ? "DTL-required" : "DTL-expendable"); for (int t = 0; t < DTL_TYPES; t++) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; if (zfs_range_tree_space(rt) == 0) continue; (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", indent + 2, "", name[t]); zfs_range_tree_walk(rt, dump_dtl_seg, prefix); if (dump_opt['d'] > 5 && vd->vdev_children == 0) dump_spacemap(spa->spa_meta_objset, vd->vdev_dtl_sm); } for (unsigned c = 0; c < vd->vdev_children; c++) dump_dtl(vd->vdev_child[c], indent + 4); } static void dump_history(spa_t *spa) { nvlist_t **events = NULL; char *buf; uint64_t resid, len, off = 0; uint_t num = 0; int error; char tbuf[30]; if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", __func__); return; } do { len = SPA_OLD_MAXBLOCKSIZE; if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { (void) fprintf(stderr, "Unable to read history: " "error %d\n", error); free(buf); return; } if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) break; off -= resid; } while (len != 0); (void) printf("\nHistory:\n"); for (unsigned i = 0; i < num; i++) { boolean_t printed = B_FALSE; if (nvlist_exists(events[i], ZPOOL_HIST_TIME)) { time_t tsec; struct tm t; tsec = fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME); (void) localtime_r(&tsec, &t); (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); } else { tbuf[0] = '\0'; } if (nvlist_exists(events[i], ZPOOL_HIST_CMD)) { (void) printf("%s %s\n", tbuf, fnvlist_lookup_string(events[i], ZPOOL_HIST_CMD)); } else if (nvlist_exists(events[i], ZPOOL_HIST_INT_EVENT)) { uint64_t ievent; ievent = fnvlist_lookup_uint64(events[i], ZPOOL_HIST_INT_EVENT); if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) goto next; (void) printf(" %s [internal %s txg:%ju] %s\n", tbuf, zfs_history_event_names[ievent], fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG), fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(events[i], 
ZPOOL_HIST_INT_NAME)) { (void) printf("%s [txg:%ju] %s", tbuf, fnvlist_lookup_uint64(events[i], ZPOOL_HIST_TXG), fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_NAME)); if (nvlist_exists(events[i], ZPOOL_HIST_DSNAME)) { (void) printf(" %s (%llu)", fnvlist_lookup_string(events[i], ZPOOL_HIST_DSNAME), (u_longlong_t)fnvlist_lookup_uint64( events[i], ZPOOL_HIST_DSID)); } (void) printf(" %s\n", fnvlist_lookup_string(events[i], ZPOOL_HIST_INT_STR)); } else if (nvlist_exists(events[i], ZPOOL_HIST_IOCTL)) { (void) printf("%s ioctl %s\n", tbuf, fnvlist_lookup_string(events[i], ZPOOL_HIST_IOCTL)); if (nvlist_exists(events[i], ZPOOL_HIST_INPUT_NVL)) { (void) printf(" input:\n"); dump_nvlist(fnvlist_lookup_nvlist(events[i], ZPOOL_HIST_INPUT_NVL), 8); } if (nvlist_exists(events[i], ZPOOL_HIST_OUTPUT_NVL)) { (void) printf(" output:\n"); dump_nvlist(fnvlist_lookup_nvlist(events[i], ZPOOL_HIST_OUTPUT_NVL), 8); } if (nvlist_exists(events[i], ZPOOL_HIST_ERRNO)) { (void) printf(" errno: %lld\n", (longlong_t)fnvlist_lookup_int64(events[i], ZPOOL_HIST_ERRNO)); } } else { goto next; } printed = B_TRUE; next: if (dump_opt['h'] > 1) { if (!printed) (void) printf("unrecognized record:\n"); dump_nvlist(events[i], 2); } } free(buf); } static void dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static uint64_t blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { if (dnp == NULL) { ASSERT(zb->zb_level < 0); if (zb->zb_object == 0) return (zb->zb_blkid); return (zb->zb_blkid * BP_GET_LSIZE(bp)); } ASSERT(zb->zb_level >= 0); return ((zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, const blkptr_t *bp) { static abd_t *pabd = NULL; void *buf; zio_t *zio; zfs_zstdhdr_t zstd_hdr; int error; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) return; if (BP_IS_HOLE(bp)) return; if (BP_IS_EMBEDDED(bp)) { buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); zdb_exit(1); } decode_embedded_bp_compressed(bp, buf); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); free(buf); zstd_hdr.c_len = BE_32(zstd_hdr.c_len); zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), zfs_get_hdrlevel(&zstd_hdr)); return; } if (!pabd) pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); zio = zio_root(spa, NULL, NULL, 0); /* Decrypt but don't decompress so we can read the compression header */ zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, NULL)); error = zio_wait(zio); if (error) { (void) fprintf(stderr, "read failed: %d\n", error); return; } buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); zstd_hdr.c_len = BE_32(zstd_hdr.c_len); zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:NORMAL", zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), zfs_get_hdrlevel(&zstd_hdr)); abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); } static void snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, boolean_t bp_freed) { const dva_t *dva = bp->blk_dva; int 
ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; int i; if (dump_opt['b'] >= 6) { snprintf_blkptr(blkbuf, buflen, bp); if (bp_freed) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); } return; } if (BP_IS_EMBEDDED(bp)) { (void) sprintf(blkbuf, "EMBEDDED et=%u %llxL/%llxP B=%llu", (int)BPE_GET_ETYPE(bp), (u_longlong_t)BPE_GET_LSIZE(bp), (u_longlong_t)BPE_GET_PSIZE(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); return; } blkbuf[0] = '\0'; for (i = 0; i < ndvas; i++) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llu:%llx:%llx%s ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i]), (DVA_GET_GANG(&dva[i]) ? "G" : "")); } if (BP_IS_HOLE(bp)) { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL B=%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp)); } else { (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)BP_GET_FILL(bp), (u_longlong_t)BP_GET_LOGICAL_BIRTH(bp), (u_longlong_t)BP_GET_PHYSICAL_BIRTH(bp)); if (bp_freed) (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " %s", "FREE"); (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " cksum=%016llx:%016llx:%016llx:%016llx", (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], (u_longlong_t)bp->blk_cksum.zc_word[3]); } } static u_longlong_t print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp) { char blkbuf[BP_SPRINTF_LEN]; u_longlong_t offset; int l; offset = (u_longlong_t)blkid2offset(dnp, bp, zb); (void) printf("%16llx ", offset); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { (void) printf(" "); } } snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); (void) printf("%s", blkbuf); if (!BP_IS_EMBEDDED(bp)) { if (BP_GET_TYPE(bp) != dnp->dn_type) { (void) printf(" (ERROR: Block pointer type " "(%llu) does not match dnode type (%hhu))", BP_GET_TYPE(bp), dnp->dn_type); corruption_found = B_TRUE; } if (BP_GET_LEVEL(bp) != zb->zb_level) { (void) printf(" (ERROR: Block pointer level " "(%llu) does not match bookmark level (%lld))", - BP_GET_LEVEL(bp), (u_longlong_t)zb->zb_level); + BP_GET_LEVEL(bp), (longlong_t)zb->zb_level); corruption_found = B_TRUE; } } (void) printf("\n"); return (offset); } static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_phys_t *zb) { u_longlong_t offset; int err = 0; if (BP_GET_BIRTH(bp) == 0) return (0); offset = print_indirect(spa, bp, zb, dnp); if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; uint64_t fill = 0; ASSERT(!BP_IS_REDACTED(bp)); err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err) return (err); ASSERT(buf->b_data); /* recursively visit blocks below this */ cbp = buf->b_data; for (i = 0; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = 
visit_indirect(spa, dnp, cbp, &czb); if (err) break; fill += BP_GET_FILL(cbp); } if (!err) { if (fill != BP_GET_FILL(bp)) { (void) printf("%16llx: Block pointer " "fill (%llu) does not match calculated " "value (%llu)\n", offset, BP_GET_FILL(bp), (u_longlong_t)fill); corruption_found = B_TRUE; } } arc_buf_destroy(buf, &buf); } return (err); } static void dump_indirect(dnode_t *dn) { dnode_phys_t *dnp = dn->dn_phys; zbookmark_phys_t czb; (void) printf("Indirect blocks:\n"); SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (int j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } (void) printf("\n"); } static void dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object; dsl_dir_phys_t *dd = data; time_t crtime; char nice[32]; /* make sure nicenum has enough space */ _Static_assert(sizeof (nice) >= NN_NUMBUF_SZ, "nice truncated"); if (dd == NULL) return; ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); (void) printf("\t\tparent_dir_obj = %llu\n", (u_longlong_t)dd->dd_parent_obj); (void) printf("\t\torigin_obj = %llu\n", (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); (void) printf("\t\tused_bytes = %s\n", nice); zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); (void) printf("\t\tcompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); (void) printf("\t\tuncompressed_bytes = %s\n", nice); zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); (void) printf("\t\tquota = %s\n", nice); zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); (void) printf("\t\tflags = %llx\n", (u_longlong_t)dd->dd_flags); #define DO(which) \ zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ sizeof (nice)); \ (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) DO(HEAD); DO(SNAP); DO(CHILD); DO(CHILD_RSRV); DO(REFRSRV); #undef DO (void) printf("\t\tclones = %llu\n", (u_longlong_t)dd->dd_clones); } static void dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object; dsl_dataset_phys_t *ds = data; time_t crtime; char used[32], compressed[32], uncompressed[32], unique[32]; char blkbuf[BP_SPRINTF_LEN]; /* make sure nicenum has enough space */ _Static_assert(sizeof (used) >= NN_NUMBUF_SZ, "used truncated"); _Static_assert(sizeof (compressed) >= NN_NUMBUF_SZ, "compressed truncated"); _Static_assert(sizeof (uncompressed) >= NN_NUMBUF_SZ, "uncompressed truncated"); _Static_assert(sizeof (unique) >= NN_NUMBUF_SZ, "unique truncated"); if (ds == NULL) return; ASSERT(size == sizeof (*ds)); crtime = ds->ds_creation_time; zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, sizeof (uncompressed)); zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", 
(u_longlong_t)ds->ds_dir_obj); (void) printf("\t\tprev_snap_obj = %llu\n", (u_longlong_t)ds->ds_prev_snap_obj); (void) printf("\t\tprev_snap_txg = %llu\n", (u_longlong_t)ds->ds_prev_snap_txg); (void) printf("\t\tnext_snap_obj = %llu\n", (u_longlong_t)ds->ds_next_snap_obj); (void) printf("\t\tsnapnames_zapobj = %llu\n", (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); (void) printf("\t\tuserrefs_obj = %llu\n", (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); (void) printf("\t\tdeadlist_obj = %llu\n", (u_longlong_t)ds->ds_deadlist_obj); (void) printf("\t\tused_bytes = %s\n", used); (void) printf("\t\tcompressed_bytes = %s\n", compressed); (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); (void) printf("\t\tunique = %s\n", unique); (void) printf("\t\tfsid_guid = %llu\n", (u_longlong_t)ds->ds_fsid_guid); (void) printf("\t\tguid = %llu\n", (u_longlong_t)ds->ds_guid); (void) printf("\t\tflags = %llx\n", (u_longlong_t)ds->ds_flags); (void) printf("\t\tnext_clones_obj = %llu\n", (u_longlong_t)ds->ds_next_clones_obj); (void) printf("\t\tprops_obj = %llu\n", (u_longlong_t)ds->ds_props_obj); (void) printf("\t\tbp = %s\n", blkbuf); } static int dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; if (BP_GET_BIRTH(bp) != 0) { snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("\t%s\n", blkbuf); } return (0); } static void dump_bptree(objset_t *os, uint64_t obj, const char *name) { char bytes[32]; bptree_phys_t *bt; dmu_buf_t *db; /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); if (dump_opt['d'] < 3) return; VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); bt = db->db_data; zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); (void) printf("\n %s: %llu datasets, %s\n", name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); dmu_buf_rele(db, FTAG); if (dump_opt['d'] < 5) return; (void) printf("\n"); (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); } static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { (void) arg, (void) tx; char blkbuf[BP_SPRINTF_LEN]; ASSERT(BP_GET_BIRTH(bp) != 0); snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); (void) printf("\t%s\n", blkbuf); return (0); } static void dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) { char bytes[32]; char comp[32]; char uncomp[32]; uint64_t i; /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); if (dump_opt['d'] < 3) return; zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu freed, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, 
comp, uncomp); } else { (void) printf(" %*s: object %llu, %llu local " "blkptrs, %llu subobjs in object %llu, " "%s (%s/%s comp)\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, (u_longlong_t)bpo->bpo_phys->bpo_subobjs, bytes, comp, uncomp); } for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); corruption_found = B_TRUE; continue; } dump_full_bpobj(&subbpo, "subobj", indent + 1); bpobj_close(&subbpo); } } else { if (bpo->bpo_havefreed) { (void) printf(" %*s: object %llu, %llu blkptrs, " "%llu freed, %s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, (u_longlong_t)bpo->bpo_phys->bpo_num_freed, bytes); } else { (void) printf(" %*s: object %llu, %llu blkptrs, " "%s\n", indent * 8, name, (u_longlong_t)bpo->bpo_object, (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, bytes); } } if (dump_opt['d'] < 5) return; if (indent == 0) { (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); (void) printf("\n"); } } static int dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, boolean_t print_list) { int err = 0; zfs_bookmark_phys_t prop; objset_t *mos = dp->dp_spa->spa_meta_objset; err = dsl_bookmark_lookup(dp, name, NULL, &prop); if (err != 0) { return (err); } (void) printf("\t#%s: ", strchr(name, '#') + 1); (void) printf("{guid: %llx creation_txg: %llu creation_time: " "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, (u_longlong_t)prop.zbm_creation_txg, (u_longlong_t)prop.zbm_creation_time, (u_longlong_t)prop.zbm_redaction_obj); IMPLY(print_list, print_redact); if (!print_redact || prop.zbm_redaction_obj == 0) return (0); redaction_list_t *rl; VERIFY0(dsl_redaction_list_hold_obj(dp, prop.zbm_redaction_obj, FTAG, &rl)); redaction_list_phys_t *rlp = rl->rl_phys; (void) printf("\tRedacted:\n\t\tProgress: "); if (rlp->rlp_last_object != UINT64_MAX || rlp->rlp_last_blkid != UINT64_MAX) { (void) printf("%llu %llu (incomplete)\n", (u_longlong_t)rlp->rlp_last_object, (u_longlong_t)rlp->rlp_last_blkid); } else { (void) printf("complete\n"); } (void) printf("\t\tSnapshots: ["); for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { if (i > 0) (void) printf(", "); (void) printf("%0llu", (u_longlong_t)rlp->rlp_snaps[i]); } (void) printf("]\n\t\tLength: %llu\n", (u_longlong_t)rlp->rlp_num_entries); if (!print_list) { dsl_redaction_list_rele(rl, FTAG); return (0); } if (rlp->rlp_num_entries == 0) { dsl_redaction_list_rele(rl, FTAG); (void) printf("\t\tRedaction List: []\n\n"); return (0); } redact_block_phys_t *rbp_buf; uint64_t size; dmu_object_info_t doi; VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); size = doi.doi_max_offset; rbp_buf = kmem_alloc(size, KM_SLEEP); err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, rbp_buf, 0); if (err != 0) { dsl_redaction_list_rele(rl, FTAG); kmem_free(rbp_buf, size); return (err); } (void) printf("\t\tRedaction List: [{object: %llx, offset: " "%llx, blksz: %x, count: %llx}", (u_longlong_t)rbp_buf[0].rbp_object, (u_longlong_t)rbp_buf[0].rbp_blkid, (uint_t)(redact_block_get_size(&rbp_buf[0])), (u_longlong_t)redact_block_get_count(&rbp_buf[0])); for (size_t i = 1; i < 
rlp->rlp_num_entries; i++) { (void) printf(",\n\t\t{object: %llx, offset: %llx, " "blksz: %x, count: %llx}", (u_longlong_t)rbp_buf[i].rbp_object, (u_longlong_t)rbp_buf[i].rbp_blkid, (uint_t)(redact_block_get_size(&rbp_buf[i])), (u_longlong_t)redact_block_get_count(&rbp_buf[i])); } dsl_redaction_list_rele(rl, FTAG); kmem_free(rbp_buf, size); (void) printf("]\n\n"); return (0); } static void dump_bookmarks(objset_t *os, int verbosity) { zap_cursor_t zc; zap_attribute_t *attrp; dsl_dataset_t *ds = dmu_objset_ds(os); dsl_pool_t *dp = spa_get_dsl(os->os_spa); objset_t *mos = os->os_spa->spa_meta_objset; if (verbosity < 4) return; attrp = zap_attribute_alloc(); dsl_pool_config_enter(dp, FTAG); for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); zap_cursor_retrieve(&zc, attrp) == 0; zap_cursor_advance(&zc)) { char osname[ZFS_MAX_DATASET_NAME_LEN]; char buf[ZFS_MAX_DATASET_NAME_LEN]; int len; dmu_objset_name(os, osname); len = snprintf(buf, sizeof (buf), "%s#%s", osname, attrp->za_name); VERIFY3S(len, <, ZFS_MAX_DATASET_NAME_LEN); (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); } zap_cursor_fini(&zc); dsl_pool_config_exit(dp, FTAG); zap_attribute_free(attrp); } static void bpobj_count_refd(bpobj_t *bpo) { mos_obj_refd(bpo->bpo_object); if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { mos_obj_refd(bpo->bpo_phys->bpo_subobjs); for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { uint64_t subobj; bpobj_t subbpo; int error; VERIFY0(dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, i * sizeof (subobj), sizeof (subobj), &subobj, 0)); error = bpobj_open(&subbpo, bpo->bpo_os, subobj); if (error != 0) { (void) printf("ERROR %u while trying to open " "subobj id %llu\n", error, (u_longlong_t)subobj); corruption_found = B_TRUE; continue; } bpobj_count_refd(&subbpo); bpobj_close(&subbpo); } } } static int dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) { spa_t *spa = arg; uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dle->dle_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dle->dle_bpobj); return (0); } static int dsl_deadlist_entry_dump(void *arg, dsl_deadlist_entry_t *dle) { ASSERT0P(arg); if (dump_opt['d'] >= 5) { char buf[128]; (void) snprintf(buf, sizeof (buf), "mintxg %llu -> obj %llu", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); dump_full_bpobj(&dle->dle_bpobj, buf, 0); } else { (void) printf("mintxg %llu -> obj %llu\n", (longlong_t)dle->dle_mintxg, (longlong_t)dle->dle_bpobj.bpo_object); } return (0); } static void dump_blkptr_list(dsl_deadlist_t *dl, const char *name) { char bytes[32]; char comp[32]; char uncomp[32]; char entries[32]; spa_t *spa = dmu_objset_spa(dl->dl_os); uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; if (dl->dl_oldfmt) { if (dl->dl_bpobj.bpo_object != empty_bpobj) bpobj_count_refd(&dl->dl_bpobj); } else { mos_obj_refd(dl->dl_object); dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); } /* make sure nicenum has enough space */ _Static_assert(sizeof (bytes) >= NN_NUMBUF_SZ, "bytes truncated"); _Static_assert(sizeof (comp) >= NN_NUMBUF_SZ, "comp truncated"); _Static_assert(sizeof (uncomp) >= NN_NUMBUF_SZ, "uncomp truncated"); _Static_assert(sizeof (entries) >= NN_NUMBUF_SZ, "entries truncated"); if (dump_opt['d'] < 3) return; if (dl->dl_oldfmt) { dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); return; } zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); zdb_nicenum(dl->dl_phys->dl_uncomp, 
uncomp, sizeof (uncomp)); zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); (void) printf("\n %s: %s (%s/%s comp), %s entries\n", name, bytes, comp, uncomp, entries); if (dump_opt['d'] < 4) return; (void) putchar('\n'); dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); } static int verify_dd_livelist(objset_t *os) { uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; dsl_pool_t *dp = spa_get_dsl(os->os_spa); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; ASSERT(!dmu_objset_is_snapshot(os)); if (!dsl_deadlist_is_open(&dd->dd_livelist)) return (0); /* Iterate through the livelist to check for duplicates */ dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, NULL); dsl_pool_config_enter(dp, FTAG); dsl_deadlist_space(&dd->dd_livelist, &ll_used, &ll_comp, &ll_uncomp); dsl_dataset_t *origin_ds; ASSERT(dsl_pool_config_held(dp)); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, &used, &comp, &uncomp)); dsl_dataset_rele(origin_ds, FTAG); dsl_pool_config_exit(dp, FTAG); /* * It's possible that the dataset's uncomp space is larger than the * livelist's because livelists do not track embedded block pointers */ if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { char nice_used[32], nice_comp[32], nice_uncomp[32]; (void) printf("Discrepancy in space accounting:\n"); zdb_nicenum(used, nice_used, sizeof (nice_used)); zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("dir: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); (void) printf("livelist: used %s, comp %s, uncomp %s\n", nice_used, nice_comp, nice_uncomp); return (1); } return (0); } static char *key_material = NULL; static boolean_t zdb_derive_key(dsl_dir_t *dd, uint8_t *key_out) { uint64_t keyformat, salt, iters; int i; unsigned char c; VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), sizeof (uint64_t), 1, &keyformat)); switch (keyformat) { case ZFS_KEYFORMAT_HEX: for (i = 0; i < WRAPPING_KEY_LEN * 2; i += 2) { if (!isxdigit(key_material[i]) || !isxdigit(key_material[i+1])) return (B_FALSE); if (sscanf(&key_material[i], "%02hhx", &c) != 1) return (B_FALSE); key_out[i / 2] = c; } break; case ZFS_KEYFORMAT_PASSPHRASE: VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), sizeof (uint64_t), 1, &salt)); VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), sizeof (uint64_t), 1, &iters)); if (PKCS5_PBKDF2_HMAC_SHA1(key_material, strlen(key_material), ((uint8_t *)&salt), sizeof (uint64_t), iters, WRAPPING_KEY_LEN, key_out) != 1) return (B_FALSE); break; default: fatal("no support for key format %u\n", (unsigned int) keyformat); } return (B_TRUE); } static char encroot[ZFS_MAX_DATASET_NAME_LEN]; static boolean_t key_loaded = B_FALSE; static void zdb_load_key(objset_t *os) { dsl_pool_t *dp; dsl_dir_t *dd, *rdd; uint8_t key[WRAPPING_KEY_LEN]; uint64_t rddobj; int err; dp = spa_get_dsl(os->os_spa); dd = os->os_dsl_dataset->ds_dir; dsl_pool_config_enter(dp, FTAG); VERIFY0(zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, DSL_CRYPTO_KEY_ROOT_DDOBJ, sizeof (uint64_t), 1, &rddobj)); 
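/*
 * zdb_derive_key() above converts the user-supplied key_material into the
 * raw WRAPPING_KEY_LEN-byte wrapping key. For ZFS_KEYFORMAT_HEX that means
 * WRAPPING_KEY_LEN * 2 hex digits; an illustrative "000102..." decodes to
 * the bytes 0x00, 0x01, 0x02, ... For ZFS_KEYFORMAT_PASSPHRASE the key is
 * PBKDF2-HMAC-SHA1(key_material, salt, iters), where the 8-byte salt and
 * the iteration count are read back from the dataset's crypto object, so
 * the derived key depends on the stored salt as well as on the passphrase.
 */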
VERIFY0(dsl_dir_hold_obj(dd->dd_pool, rddobj, NULL, FTAG, &rdd)); dsl_dir_name(rdd, encroot); dsl_dir_rele(rdd, FTAG); if (!zdb_derive_key(dd, key)) fatal("couldn't derive encryption key"); dsl_pool_config_exit(dp, FTAG); ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_UNAVAILABLE); dsl_crypto_params_t *dcp; nvlist_t *crypto_args; crypto_args = fnvlist_alloc(); fnvlist_add_uint8_array(crypto_args, "wkeydata", (uint8_t *)key, WRAPPING_KEY_LEN); VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, crypto_args, &dcp)); err = spa_keystore_load_wkey(encroot, dcp, B_FALSE); dsl_crypto_params_free(dcp, (err != 0)); fnvlist_free(crypto_args); if (err != 0) fatal( "couldn't load encryption key for %s: %s", encroot, err == ZFS_ERR_CRYPTO_NOTSUP ? "crypto params not supported" : strerror(err)); ASSERT3U(dsl_dataset_get_keystatus(dd), ==, ZFS_KEYSTATUS_AVAILABLE); printf("Unlocked encryption root: %s\n", encroot); key_loaded = B_TRUE; } static void zdb_unload_key(void) { if (!key_loaded) return; VERIFY0(spa_keystore_unload_wkey(encroot)); key_loaded = B_FALSE; } static avl_tree_t idx_tree; static avl_tree_t domain_tree; static boolean_t fuid_table_loaded; static objset_t *sa_os = NULL; static sa_attr_type_t *sa_attr_table = NULL; static int open_objset(const char *path, const void *tag, objset_t **osp) { int err; uint64_t sa_attrs = 0; uint64_t version = 0; VERIFY0P(sa_os); /* * We can't own an objset if it's redacted. Therefore, we do this * dance: hold the objset, then acquire a long hold on its dataset, then * release the pool (which is held as part of holding the objset). */ if (dump_opt['K']) { /* decryption requested, try to load keys */ err = dmu_objset_hold(path, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to hold dataset " "'%s': %s\n", path, strerror(err)); return (err); } dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); dsl_pool_rele(dmu_objset_pool(*osp), tag); /* succeeds or dies */ zdb_load_key(*osp); /* release it all */ dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); dsl_dataset_rele(dmu_objset_ds(*osp), tag); } int ds_hold_flags = key_loaded ? DS_HOLD_FLAG_DECRYPT : 0; err = dmu_objset_hold_flags(path, ds_hold_flags, tag, osp); if (err != 0) { (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", path, strerror(err)); return (err); } dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); dsl_pool_rele(dmu_objset_pool(*osp), tag); if (dmu_objset_type(*osp) == DMU_OST_ZFS && (key_loaded || !(*osp)->os_encrypted)) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &version); if (version >= ZPL_VERSION_SA) { (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_attrs); } err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, &sa_attr_table); if (err != 0) { (void) fprintf(stderr, "sa_setup failed: %s\n", strerror(err)); dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); dsl_dataset_rele_flags(dmu_objset_ds(*osp), ds_hold_flags, tag); *osp = NULL; } } sa_os = *osp; return (err); } static void close_objset(objset_t *os, const void *tag) { VERIFY3P(os, ==, sa_os); if (os->os_sa != NULL) sa_tear_down(os); dsl_dataset_long_rele(dmu_objset_ds(os), tag); dsl_dataset_rele_flags(dmu_objset_ds(os), key_loaded ? DS_HOLD_FLAG_DECRYPT : 0, tag); sa_attr_table = NULL; sa_os = NULL; zdb_unload_key(); } static void fuid_table_destroy(void) { if (fuid_table_loaded) { zfs_fuid_table_destroy(&idx_tree, &domain_tree); fuid_table_loaded = B_FALSE; } } /* * Clean up DDT internal state. 
ddt_lookup() adds entries to ddt_tree, which on * a live pool are normally cleaned up during ddt_sync(). We can't do that (and * wouldn't want to anyway), but if we don't clean up the presence of stuff on * ddt_tree will trip asserts in ddt_table_free(). So, we clean up ourselves. * * Note that this is not a particularly efficient way to do this, but * ddt_remove() is the only public method that can do the work we need, and it * requires the right locks and etc to do the job. This is only ever called * during zdb shutdown so efficiency is not especially important. */ static void zdb_ddt_cleanup(spa_t *spa) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (!ddt) continue; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); ddt_enter(ddt); ddt_entry_t *dde = avl_first(&ddt->ddt_tree), *next; while (dde) { next = AVL_NEXT(&ddt->ddt_tree, dde); dde->dde_io = NULL; ddt_remove(ddt, dde); dde = next; } ddt_exit(ddt); spa_config_exit(spa, SCL_CONFIG, FTAG); } } static void zdb_exit(int reason) { if (spa != NULL) zdb_ddt_cleanup(spa); if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { spa_close(spa, FTAG); } fuid_table_destroy(); if (kernel_init_done) kernel_fini(); exit(reason); } /* * print uid or gid information. * For normal POSIX id just the id is printed in decimal format. * For CIFS files with FUID the fuid is printed in hex followed by * the domain-rid string. */ static void print_idstr(uint64_t id, const char *id_type) { if (FUID_INDEX(id)) { const char *domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); (void) printf("\t%s %llx [%s-%d]\n", id_type, (u_longlong_t)id, domain, (int)FUID_RID(id)); } else { (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); } } static void dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) { uint32_t uid_idx, gid_idx; uid_idx = FUID_INDEX(uid); gid_idx = FUID_INDEX(gid); /* Load domain table, if not already loaded */ if (!fuid_table_loaded && (uid_idx || gid_idx)) { uint64_t fuid_obj; /* first find the fuid object. 
It lives in the master node */ VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj)); zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; } print_idstr(uid, "uid"); print_idstr(gid, "gid"); } static void dump_znode_sa_xattr(sa_handle_t *hdl) { nvlist_t *sa_xattr; nvpair_t *elem = NULL; int sa_xattr_size = 0; int sa_xattr_entries = 0; int error; char *sa_xattr_packed; error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size); if (error || sa_xattr_size == 0) return; sa_xattr_packed = malloc(sa_xattr_size); if (sa_xattr_packed == NULL) return; error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR], sa_xattr_packed, sa_xattr_size); if (error) { free(sa_xattr_packed); return; } error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0); if (error) { free(sa_xattr_packed); return; } while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) sa_xattr_entries++; (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", sa_xattr_size, sa_xattr_entries); while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { boolean_t can_print = !dump_opt['P']; uchar_t *value; uint_t cnt, idx; (void) printf("\t\t%s = ", nvpair_name(elem)); nvpair_value_byte_array(elem, &value, &cnt); for (idx = 0; idx < cnt; ++idx) { if (!isprint(value[idx])) { can_print = B_FALSE; break; } } for (idx = 0; idx < cnt; ++idx) { if (can_print) (void) putchar(value[idx]); else (void) printf("\\%3.3o", value[idx]); } (void) putchar('\n'); } nvlist_free(sa_xattr); free(sa_xattr_packed); } static void dump_znode_symlink(sa_handle_t *hdl) { int sa_symlink_size = 0; char linktarget[MAXPATHLEN]; int error; error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); if (error || sa_symlink_size == 0) { return; } if (sa_symlink_size >= sizeof (linktarget)) { (void) printf("symlink size %d is too large\n", sa_symlink_size); return; } linktarget[sa_symlink_size] = '\0'; if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], &linktarget, sa_symlink_size) == 0) (void) printf("\ttarget %s\n", linktarget); } static void dump_znode(objset_t *os, uint64_t object, void *data, size_t size) { (void) data, (void) size; char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ sa_handle_t *hdl; uint64_t xattr, rdev, gen; uint64_t uid, gid, mode, fsize, parent, links; uint64_t pflags; uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; time_t z_crtime, z_atime, z_mtime, z_ctime; sa_bulk_attr_t bulk[12]; int idx = 0; int error; VERIFY3P(os, ==, sa_os); if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { (void) printf("Failed to get handle for SA znode\n"); return; } SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, &fsize, 8); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, acctm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, modtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, crtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, chgtm, 16); SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, &pflags, 8); if 
(sa_bulk_lookup(hdl, bulk, idx)) { (void) sa_handle_destroy(hdl); return; } z_crtime = (time_t)crtm[0]; z_atime = (time_t)acctm[0]; z_mtime = (time_t)modtm[0]; z_ctime = (time_t)chgtm[0]; if (dump_opt['d'] > 4) { error = zfs_obj_to_path(os, object, path, sizeof (path)); if (error == ESTALE) { (void) snprintf(path, sizeof (path), "on delete queue"); } else if (error != 0) { leaked_objects++; (void) snprintf(path, sizeof (path), "path not found, possibly leaked"); } (void) printf("\tpath %s\n", path); } if (S_ISLNK(mode)) dump_znode_symlink(hdl); dump_uidgid(os, uid, gid); (void) printf("\tatime %s", ctime(&z_atime)); (void) printf("\tmtime %s", ctime(&z_mtime)); (void) printf("\tctime %s", ctime(&z_ctime)); (void) printf("\tcrtime %s", ctime(&z_crtime)); (void) printf("\tgen %llu\n", (u_longlong_t)gen); (void) printf("\tmode %llo\n", (u_longlong_t)mode); (void) printf("\tsize %llu\n", (u_longlong_t)fsize); (void) printf("\tparent %llu\n", (u_longlong_t)parent); (void) printf("\tlinks %llu\n", (u_longlong_t)links); (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { uint64_t projid; if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, sizeof (uint64_t)) == 0) (void) printf("\tprojid %llu\n", (u_longlong_t)projid); } if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, sizeof (uint64_t)) == 0) (void) printf("\txattr %llu\n", (u_longlong_t)xattr); if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, sizeof (uint64_t)) == 0) (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); dump_znode_sa_xattr(hdl); sa_handle_destroy(hdl); } static void dump_acl(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static void dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { (void) os, (void) object, (void) data, (void) size; } static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ dump_none, /* packed nvlist */ dump_packed_nvlist, /* packed nvlist size */ dump_none, /* bpobj */ dump_bpobj, /* bpobj header */ dump_none, /* SPA space map header */ dump_none, /* SPA space map */ dump_none, /* ZIL intent log */ dump_dnode, /* DMU dnode */ dump_dmu_objset, /* DMU objset */ dump_dsl_dir, /* DSL directory */ dump_zap, /* DSL directory child map */ dump_zap, /* DSL dataset snap map */ dump_zap, /* DSL props */ dump_dsl_dataset, /* DSL dataset */ dump_znode, /* ZFS znode */ dump_acl, /* ZFS V0 ACL */ dump_uint8, /* ZFS plain file */ dump_zpldir, /* ZFS directory */ dump_zap, /* ZFS master node */ dump_zap, /* ZFS delete queue */ dump_uint8, /* zvol object */ dump_zap, /* zvol prop */ dump_uint8, /* other uint8[] */ dump_uint64, /* other uint64[] */ dump_zap, /* other ZAP */ dump_zap, /* persistent error log */ dump_uint8, /* SPA history */ dump_history_offsets, /* SPA history offsets */ dump_zap, /* Pool properties */ dump_zap, /* DSL permissions */ dump_acl, /* ZFS ACL */ dump_uint8, /* ZFS SYSACL */ dump_none, /* FUID nvlist */ dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ dump_zap, /* ZFS user/group/project used */ dump_zap, /* ZFS user/group/project quota */ dump_zap, /* snapshot refcount tags */ dump_ddt_zap, /* DDT ZAP object */ dump_zap, /* DDT statistics */ dump_znode, /* SA object */ dump_zap, /* SA Master Node */ dump_sa_attrs, /* SA attribute registration */ dump_sa_layouts, /* SA 
attribute layouts */ dump_zap, /* DSL scrub translations */ dump_none, /* fake dedup BP */ dump_zap, /* deadlist */ dump_none, /* deadlist hdr */ dump_zap, /* dsl clones */ dump_bpobj_subobjs, /* bpobj subobjs */ dump_unknown, /* Unknown type, must be last */ }; static boolean_t match_object_type(dmu_object_type_t obj_type, uint64_t flags) { boolean_t match = B_TRUE; switch (obj_type) { case DMU_OT_DIRECTORY_CONTENTS: if (!(flags & ZOR_FLAG_DIRECTORY)) match = B_FALSE; break; case DMU_OT_PLAIN_FILE_CONTENTS: if (!(flags & ZOR_FLAG_PLAIN_FILE)) match = B_FALSE; break; case DMU_OT_SPACE_MAP: if (!(flags & ZOR_FLAG_SPACE_MAP)) match = B_FALSE; break; default: if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { if (!(flags & ZOR_FLAG_ZAP)) match = B_FALSE; break; } /* * If all bits except some of the supported flags are * set, the user combined the all-types flag (A) with * a negated flag to exclude some types (e.g. A-f to * show all object types except plain files). */ if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) match = B_FALSE; break; } return (match); } static void dump_object(objset_t *os, uint64_t object, int verbosity, boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) { dmu_buf_t *db = NULL; dmu_object_info_t doi; dnode_t *dn; boolean_t dnode_held = B_FALSE; void *bonus = NULL; size_t bsize = 0; char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; char bonus_size[32]; char aux[50]; int error; /* make sure nicenum has enough space */ _Static_assert(sizeof (iblk) >= NN_NUMBUF_SZ, "iblk truncated"); _Static_assert(sizeof (dblk) >= NN_NUMBUF_SZ, "dblk truncated"); _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated"); _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated"); _Static_assert(sizeof (bonus_size) >= NN_NUMBUF_SZ, "bonus_size truncated"); if (*print_header) { (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", "lsize", "%full", "type"); *print_header = 0; } if (object == 0) { dn = DMU_META_DNODE(os); dmu_object_info_from_dnode(dn, &doi); } else { /* * Encrypted datasets will have sensitive bonus buffers * encrypted. Therefore we cannot hold the bonus buffer and * must hold the dnode itself instead. */ error = dmu_object_info(os, object, &doi); if (error) fatal("dmu_object_info() failed, errno %u", error); if (!key_loaded && os->os_encrypted && DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { error = dnode_hold(os, object, FTAG, &dn); if (error) fatal("dnode_hold() failed, errno %u", error); dnode_held = B_TRUE; } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) fatal("dmu_bonus_hold(%llu) failed, errno %u", object, error); bonus = db->db_data; bsize = db->db_size; dn = DB_DNODE((dmu_buf_impl_t *)db); } } /* * Default to showing all object types if no flags were specified. */ if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && !match_object_type(doi.doi_type, flags)) goto out; if (dnode_slots_used) *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); (void) snprintf(fill, sizeof (fill), "%6.2f", 100.0 * doi.doi_fill_count * doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress == ZIO_COMPRESS_INHERIT && ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { const char *compname = NULL; if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), &compname) == 0) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s)", compname); } else { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s-unknown)", ZDB_COMPRESS_NAME(os->os_compress)); } } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); } (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", (u_longlong_t)object, doi.doi_indirection, iblk, dblk, asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", "", "", "", "", "", "", bonus_size, "bonus", zdb_ot_name(doi.doi_bonus_type)); } if (verbosity >= 4) { (void) printf("\tdnode flags: %s%s%s%s\n", (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? "USED_BYTES " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? "USERUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? "USEROBJUSED_ACCOUNTED " : "", (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? "SPILL_BLKPTR" : ""); (void) printf("\tdnode maxblkid: %llu\n", (longlong_t)dn->dn_phys->dn_maxblkid); if (!dnode_held) { object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, bonus, bsize); } else { (void) printf("\t\t(bonus encrypted)\n"); } if (key_loaded || (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type))) { object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); } else { (void) printf("\t\t(object encrypted)\n"); } *print_header = B_TRUE; } if (verbosity >= 5) { if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), DN_SPILL_BLKPTR(dn->dn_phys), B_FALSE); (void) printf("\nSpill block: %s\n", blkbuf); } dump_indirect(dn); } if (verbosity >= 5) { /* * Report the list of segments that comprise the object. 
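 * Each pass of the loop below asks dnode_next_offset() for the next
 * allocated offset, then asks again with DNODE_FIND_HOLE to find where
 * that allocated run ends, and prints the half-open range.  For
 * dnode-type objects (e.g. the meta-dnode) the scan runs at level 0
 * with a blkfill of DNODES_PER_BLOCK so the segments reflect dnode
 * slots rather than data blocks.  A sample line of output
 * (illustrative values only):
 *
 *	segment [0000000000000000, 0000000000020000) size  128K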
*/ uint64_t start = 0; uint64_t end; uint64_t blkfill = 1; int minlvl = 1; if (dn->dn_type == DMU_OT_DNODE) { minlvl = 0; blkfill = DNODES_PER_BLOCK; } for (;;) { char segsize[32]; /* make sure nicenum has enough space */ _Static_assert(sizeof (segsize) >= NN_NUMBUF_SZ, "segsize truncated"); error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) break; end = start; error = dnode_next_offset(dn, DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); zdb_nicenum(end - start, segsize, sizeof (segsize)); (void) printf("\t\tsegment [%016llx, %016llx)" " size %5s\n", (u_longlong_t)start, (u_longlong_t)end, segsize); if (error) break; start = end; } } out: if (db != NULL) dmu_buf_rele(db, FTAG); if (dnode_held) dnode_rele(dn, FTAG); } static void count_dir_mos_objects(dsl_dir_t *dd) { mos_obj_refd(dd->dd_object); mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); mos_obj_refd(dsl_dir_phys(dd)->dd_clones); /* * The dd_crypto_obj can be referenced by multiple dsl_dir's. * Ignore the references after the first one. */ mos_obj_refd_multiple(dd->dd_crypto_obj); } static void count_ds_mos_objects(dsl_dataset_t *ds) { mos_obj_refd(ds->ds_object); mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); mos_obj_refd(ds->ds_bookmarks_obj); if (!dsl_dataset_is_snapshot(ds)) { count_dir_mos_objects(ds->ds_dir); } } static const char *const objset_types[DMU_OST_NUMTYPES] = { "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; /* * Parse a string denoting a range of object IDs of the form * <start>[:<end>[:flags]], and store the results in zor. * Return 0 on success. On error, return 1 and update the msg * pointer to point to a descriptive error message.
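 *
 * Illustrative inputs (not an exhaustive list):
 *	"128"		a single object ID
 *	"10:1000"	objects 10 through 1000, all object types
 *	"10:1000:A-f"	the same range, all types except plain files
 *			(flag letters are looked up in flagbits[] below)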
*/ static int parse_object_range(char *range, zopt_object_range_t *zor, const char **msg) { uint64_t flags = 0; char *p, *s, *dup, *flagstr, *tmp = NULL; size_t len; int i; int rc = 0; if (strchr(range, ':') == NULL) { zor->zor_obj_start = strtoull(range, &p, 0); if (*p != '\0') { *msg = "Invalid characters in object ID"; rc = 1; } zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); zor->zor_obj_end = zor->zor_obj_start; return (rc); } if (strchr(range, ':') == range) { *msg = "Invalid leading colon"; rc = 1; return (rc); } len = strlen(range); if (range[len - 1] == ':') { *msg = "Invalid trailing colon"; rc = 1; return (rc); } dup = strdup(range); s = strtok_r(dup, ":", &tmp); zor->zor_obj_start = strtoull(s, &p, 0); if (*p != '\0') { *msg = "Invalid characters in start object ID"; rc = 1; goto out; } s = strtok_r(NULL, ":", &tmp); zor->zor_obj_end = strtoull(s, &p, 0); if (*p != '\0') { *msg = "Invalid characters in end object ID"; rc = 1; goto out; } if (zor->zor_obj_start > zor->zor_obj_end) { *msg = "Start object ID may not exceed end object ID"; rc = 1; goto out; } s = strtok_r(NULL, ":", &tmp); if (s == NULL) { zor->zor_flags = ZOR_FLAG_ALL_TYPES; goto out; } else if (strtok_r(NULL, ":", &tmp) != NULL) { *msg = "Invalid colon-delimited field after flags"; rc = 1; goto out; } flagstr = s; for (i = 0; flagstr[i]; i++) { int bit; boolean_t negation = (flagstr[i] == '-'); if (negation) { i++; if (flagstr[i] == '\0') { *msg = "Invalid trailing negation operator"; rc = 1; goto out; } } bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { *msg = "Invalid flag"; rc = 1; goto out; } if (negation) flags &= ~bit; else flags |= bit; } zor->zor_flags = flags; zor->zor_obj_start = ZDB_MAP_OBJECT_ID(zor->zor_obj_start); zor->zor_obj_end = ZDB_MAP_OBJECT_ID(zor->zor_obj_end); out: free(dup); return (rc); } static void dump_objset(objset_t *os) { dmu_objset_stats_t dds = { 0 }; uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[32]; char blkbuf[BP_SPRINTF_LEN + 20]; char osname[ZFS_MAX_DATASET_NAME_LEN]; const char *type = "UNKNOWN"; int verbosity = dump_opt['d']; boolean_t print_header; unsigned i; int error; uint64_t total_slots_used = 0; uint64_t max_slot_used = 0; uint64_t dnode_slots; uint64_t obj_start; uint64_t obj_end; uint64_t flags; /* make sure nicenum has enough space */ _Static_assert(sizeof (numbuf) >= NN_NUMBUF_SZ, "numbuf truncated"); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); dmu_objset_fast_stat(os, &dds); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); print_header = B_TRUE; if (dds.dds_type < DMU_OST_NUMTYPES) type = objset_types[dds.dds_type]; if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = BP_GET_FILL(os->os_rootbp); refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); if (verbosity >= 4) { (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); (void) snprintf_blkptr(blkbuf + strlen(blkbuf), sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } dmu_objset_name(os, osname); (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " "%s, %llu objects%s%s\n", osname, type, (u_longlong_t)dmu_objset_id(os), (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf, (dds.dds_inconsistent) ? 
" (inconsistent)" : ""); for (i = 0; i < zopt_object_args; i++) { obj_start = zopt_object_ranges[i].zor_obj_start; obj_end = zopt_object_ranges[i].zor_obj_end; flags = zopt_object_ranges[i].zor_flags; object = obj_start; if (object == 0 || obj_start == obj_end) dump_object(os, object, verbosity, &print_header, NULL, flags); else object--; while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && object <= obj_end) { dump_object(os, object, verbosity, &print_header, NULL, flags); } } if (zopt_object_args > 0) { (void) printf("\n"); return; } if (dump_opt['i'] != 0 || verbosity >= 2) dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) { dsl_dataset_t *ds = dmu_objset_ds(os); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); if (verify_dd_livelist(os) != 0) fatal("livelist is incorrect"); } if (dsl_dataset_remap_deadlist_exists(ds)) { (void) printf("ds_remap_deadlist:\n"); dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); } count_ds_mos_objects(ds); } if (dmu_objset_ds(os) != NULL) dump_bookmarks(os, verbosity); if (verbosity < 2) return; if (BP_IS_HOLE(os->os_rootbp)) return; dump_object(os, 0, verbosity, &print_header, NULL, 0); object_count = 0; if (DMU_USERUSED_DNODE(os) != NULL && DMU_USERUSED_DNODE(os)->dn_type != 0) { dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, NULL, 0); dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, NULL, 0); } if (DMU_PROJECTUSED_DNODE(os) != NULL && DMU_PROJECTUSED_DNODE(os)->dn_type != 0) dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, &print_header, NULL, 0); object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { dump_object(os, object, verbosity, &print_header, &dnode_slots, 0); object_count++; total_slots_used += dnode_slots; max_slot_used = object + dnode_slots - 1; } (void) printf("\n"); (void) printf(" Dnode slots:\n"); (void) printf("\tTotal used: %10llu\n", (u_longlong_t)total_slots_used); (void) printf("\tMax used: %10llu\n", (u_longlong_t)max_slot_used); (void) printf("\tPercent empty: %10lf\n", (double)(max_slot_used - total_slots_used)*100 / (double)max_slot_used); (void) printf("\n"); if (error != ESRCH) { (void) fprintf(stderr, "dmu_object_next() = %d\n", error); abort(); } ASSERT3U(object_count, ==, usedobjs); if (leaked_objects != 0) { (void) printf("%d potentially leaked objects detected\n", leaked_objects); leaked_objects = 0; } } static void dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; (void) printf("%s", header ? 
header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); (void) printf("\ttimestamp = %llu UTC = %s", (u_longlong_t)ub->ub_timestamp, ctime(×tamp)); char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\tbp = %s\n", blkbuf); (void) printf("\tmmp_magic = %016llx\n", (u_longlong_t)ub->ub_mmp_magic); if (MMP_VALID(ub)) { (void) printf("\tmmp_delay = %0llu\n", (u_longlong_t)ub->ub_mmp_delay); if (MMP_SEQ_VALID(ub)) (void) printf("\tmmp_seq = %u\n", (unsigned int) MMP_SEQ(ub)); if (MMP_FAIL_INT_VALID(ub)) (void) printf("\tmmp_fail = %u\n", (unsigned int) MMP_FAIL_INT(ub)); if (MMP_INTERVAL_VALID(ub)) (void) printf("\tmmp_write = %u\n", (unsigned int) MMP_INTERVAL(ub)); /* After MMP_* to make summarize_uberblock_mmp cleaner */ (void) printf("\tmmp_valid = %x\n", (unsigned int) ub->ub_mmp_config & 0xFF); } if (dump_opt['u'] >= 4) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } (void) printf("\tcheckpoint_txg = %llu\n", (u_longlong_t)ub->ub_checkpoint_txg); (void) printf("\traidz_reflow state=%u off=%llu\n", (int)RRSS_GET_STATE(ub), (u_longlong_t)RRSS_GET_OFFSET(ub)); (void) printf("%s", footer ? footer : ""); } static void dump_config(spa_t *spa) { dmu_buf_t *db; size_t nvsize = 0; int error = 0; error = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object, FTAG, &db); if (error == 0) { nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); (void) printf("\nMOS Configuration:\n"); dump_packed_nvlist(spa->spa_meta_objset, spa->spa_config_object, (void *)&nvsize, 1); } else { (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", (u_longlong_t)spa->spa_config_object, error); } } static void dump_cachefile(const char *cachefile) { int fd; struct stat64 statbuf; char *buf; nvlist_t *config; if ((fd = open64(cachefile, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", cachefile, strerror(errno)); zdb_exit(1); } if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", cachefile, strerror(errno)); zdb_exit(1); } if ((buf = malloc(statbuf.st_size)) == NULL) { (void) fprintf(stderr, "failed to allocate %llu bytes\n", (u_longlong_t)statbuf.st_size); zdb_exit(1); } if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { (void) fprintf(stderr, "failed to read %llu bytes\n", (u_longlong_t)statbuf.st_size); zdb_exit(1); } (void) close(fd); if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { (void) fprintf(stderr, "failed to unpack nvlist\n"); zdb_exit(1); } free(buf); dump_nvlist(config, 0); nvlist_free(config); } /* * ZFS label nvlist stats */ typedef struct zdb_nvl_stats { int zns_list_count; int zns_leaf_count; size_t zns_leaf_largest; size_t zns_leaf_total; nvlist_t *zns_string; nvlist_t *zns_uint64; nvlist_t *zns_boolean; } zdb_nvl_stats_t; static void collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) { nvlist_t *list, **array; nvpair_t *nvp = NULL; const char *name; uint_t i, items; stats->zns_list_count++; while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { name = nvpair_name(nvp); switch (nvpair_type(nvp)) { case DATA_TYPE_STRING: fnvlist_add_string(stats->zns_string, name, fnvpair_value_string(nvp)); break; case DATA_TYPE_UINT64: fnvlist_add_uint64(stats->zns_uint64, name, 
fnvpair_value_uint64(nvp)); break; case DATA_TYPE_BOOLEAN: fnvlist_add_boolean(stats->zns_boolean, name); break; case DATA_TYPE_NVLIST: if (nvpair_value_nvlist(nvp, &list) == 0) collect_nvlist_stats(list, stats); break; case DATA_TYPE_NVLIST_ARRAY: if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) break; for (i = 0; i < items; i++) { collect_nvlist_stats(array[i], stats); /* collect stats on leaf vdev */ if (strcmp(name, "children") == 0) { size_t size; (void) nvlist_size(array[i], &size, NV_ENCODE_XDR); stats->zns_leaf_total += size; if (size > stats->zns_leaf_largest) stats->zns_leaf_largest = size; stats->zns_leaf_count++; } } break; default: (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); } } } static void dump_nvlist_stats(nvlist_t *nvl, size_t cap) { zdb_nvl_stats_t stats = { 0 }; size_t size, sum = 0, total; size_t noise; /* requires nvlist with non-unique names for stat collection */ VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); (void) printf("\n\nZFS Label NVList Config Stats:\n"); VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", (int)total, (int)(cap - total), 100.0 * total / cap); collect_nvlist_stats(nvl, &stats); VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", (int)fnvlist_num_pairs(stats.zns_uint64), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", (int)fnvlist_num_pairs(stats.zns_string), (int)size, 100.0 * size / total); VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); size -= noise; sum += size; (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", (int)fnvlist_num_pairs(stats.zns_boolean), (int)size, 100.0 * size / total); size = total - sum; /* treat remainder as nvlist overhead */ (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", stats.zns_list_count, (int)size, 100.0 * size / total); if (stats.zns_leaf_count > 0) { size_t average = stats.zns_leaf_total / stats.zns_leaf_count; (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", stats.zns_leaf_count, (int)average); (void) printf("%24d bytes largest\n", (int)stats.zns_leaf_largest); if (dump_opt['l'] >= 3 && average > 0) (void) printf(" space for %d additional leaf vdevs\n", (int)((cap - total) / average)); } (void) printf("\n"); nvlist_free(stats.zns_string); nvlist_free(stats.zns_uint64); nvlist_free(stats.zns_boolean); } typedef struct cksum_record { zio_cksum_t cksum; boolean_t labels[VDEV_LABELS]; avl_node_t link; } cksum_record_t; static int cksum_record_compare(const void *x1, const void *x2) { const cksum_record_t *l = (cksum_record_t *)x1; const cksum_record_t *r = (cksum_record_t *)x2; int arraysize = ARRAY_SIZE(l->cksum.zc_word); int difference = 0; for (int i = 0; i < arraysize; i++) { difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); if (difference) break; } return (difference); } static cksum_record_t * cksum_record_alloc(zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); rec->cksum = *cksum; rec->labels[l] = B_TRUE; return (rec); } static cksum_record_t * cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) { cksum_record_t 
lookup = { .cksum = *cksum }; avl_index_t where; return (avl_find(tree, &lookup, &where)); } static cksum_record_t * cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) { cksum_record_t *rec; rec = cksum_record_lookup(tree, cksum); if (rec) { rec->labels[l] = B_TRUE; } else { rec = cksum_record_alloc(cksum, l); avl_add(tree, rec); } return (rec); } static int first_label(cksum_record_t *rec) { for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i]) return (i); return (-1); } static void print_label_numbers(const char *prefix, const cksum_record_t *rec) { fputs(prefix, stdout); for (int i = 0; i < VDEV_LABELS; i++) if (rec->labels[i] == B_TRUE) printf("%d ", i); putchar('\n'); } #define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) typedef struct zdb_label { vdev_label_t label; uint64_t label_offset; nvlist_t *config_nv; cksum_record_t *config; cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; boolean_t header_printed; boolean_t read_failed; boolean_t cksum_valid; } zdb_label_t; static void print_label_header(zdb_label_t *label, int l) { if (dump_opt['q']) return; if (label->header_printed == B_TRUE) return; (void) printf("------------------------------------\n"); (void) printf("LABEL %d %s\n", l, label->cksum_valid ? "" : "(Bad label cksum)"); (void) printf("------------------------------------\n"); label->header_printed = B_TRUE; } static void print_l2arc_header(void) { (void) printf("------------------------------------\n"); (void) printf("L2ARC device header\n"); (void) printf("------------------------------------\n"); } static void print_l2arc_log_blocks(void) { (void) printf("------------------------------------\n"); (void) printf("L2ARC device log blocks\n"); (void) printf("------------------------------------\n"); } static void dump_l2arc_log_entries(uint64_t log_entries, l2arc_log_ent_phys_t *le, uint64_t i) { for (int j = 0; j < log_entries; j++) { dva_t dva = le[j].le_dva; (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " "vdev: %llu, offset: %llu\n", (u_longlong_t)i, j + 1, (u_longlong_t)DVA_GET_ASIZE(&dva), (u_longlong_t)DVA_GET_VDEV(&dva), (u_longlong_t)DVA_GET_OFFSET(&dva)); (void) printf("|\t\t\t\tbirth: %llu\n", (u_longlong_t)le[j].le_birth); (void) printf("|\t\t\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); (void) printf("|\t\t\t\tpsize: %llu\n", (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); (void) printf("|\t\t\t\tcompr: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); (void) printf("|\t\t\t\tcomplevel: %llu\n", (u_longlong_t)(&le[j])->le_complevel); (void) printf("|\t\t\t\ttype: %llu\n", (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); (void) printf("|\t\t\t\tprotected: %llu\n", (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); (void) printf("|\t\t\t\tprefetch: %llu\n", (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); (void) printf("|\t\t\t\taddress: %llu\n", (u_longlong_t)le[j].le_daddr); (void) printf("|\t\t\t\tARC state: %llu\n", (u_longlong_t)L2BLK_GET_STATE((&le[j])->le_prop)); (void) printf("|\n"); } (void) printf("\n"); } static void dump_l2arc_log_blkptr(const l2arc_log_blkptr_t *lbps) { (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps->lbp_daddr); (void) printf("|\t\tpayload_asize: %llu\n", (u_longlong_t)lbps->lbp_payload_asize); (void) printf("|\t\tpayload_start: %llu\n", (u_longlong_t)lbps->lbp_payload_start); (void) printf("|\t\tlsize: %llu\n", (u_longlong_t)L2BLK_GET_LSIZE(lbps->lbp_prop)); (void) printf("|\t\tasize: %llu\n", 
(u_longlong_t)L2BLK_GET_PSIZE(lbps->lbp_prop)); (void) printf("|\t\tcompralgo: %llu\n", (u_longlong_t)L2BLK_GET_COMPRESS(lbps->lbp_prop)); (void) printf("|\t\tcksumalgo: %llu\n", (u_longlong_t)L2BLK_GET_CHECKSUM(lbps->lbp_prop)); (void) printf("|\n\n"); } static void dump_l2arc_log_blocks(int fd, const l2arc_dev_hdr_phys_t *l2dhdr, l2arc_dev_hdr_phys_t *rebuild) { l2arc_log_blk_phys_t this_lb; uint64_t asize; l2arc_log_blkptr_t lbps[2]; zio_cksum_t cksum; int failed = 0; l2arc_dev_t dev; if (!dump_opt['q']) print_l2arc_log_blocks(); memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); dev.l2ad_evict = l2dhdr->dh_evict; dev.l2ad_start = l2dhdr->dh_start; dev.l2ad_end = l2dhdr->dh_end; if (l2dhdr->dh_start_lbps[0].lbp_daddr == 0) { /* no log blocks to read */ if (!dump_opt['q']) { (void) printf("No log blocks to read\n"); (void) printf("\n"); } return; } else { dev.l2ad_hand = lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); } dev.l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); for (;;) { if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) break; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { if (!dump_opt['q']) { (void) printf("Error while reading next log " "block\n\n"); } break; } fletcher_4_native_varsize(&this_lb, asize, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { failed++; if (!dump_opt['q']) { (void) printf("Invalid cksum\n"); dump_l2arc_log_blkptr(&lbps[0]); } break; } switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { case ZIO_COMPRESS_OFF: break; default: { abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, &this_lb, 0, asize); abd_t dabd; abd_get_from_buf_struct(&dabd, &this_lb, sizeof (this_lb)); int err = zio_decompress_data(L2BLK_GET_COMPRESS( (&lbps[0])->lbp_prop), abd, &dabd, asize, sizeof (this_lb), NULL); abd_free(&dabd); abd_free(abd); if (err != 0) { (void) printf("L2ARC block decompression " "failed\n"); goto out; } break; } } if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(&this_lb, sizeof (this_lb)); if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { if (!dump_opt['q']) (void) printf("Invalid log block magic\n\n"); break; } rebuild->dh_lb_count++; rebuild->dh_lb_asize += asize; if (dump_opt['l'] > 1 && !dump_opt['q']) { (void) printf("lb[%4llu]\tmagic: %llu\n", (u_longlong_t)rebuild->dh_lb_count, (u_longlong_t)this_lb.lb_magic); dump_l2arc_log_blkptr(&lbps[0]); } if (dump_opt['l'] > 2 && !dump_opt['q']) dump_l2arc_log_entries(l2dhdr->dh_log_entries, this_lb.lb_entries, rebuild->dh_lb_count); if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev.l2ad_evict) && !dev.l2ad_first) break; lbps[0] = lbps[1]; lbps[1] = this_lb.lb_prev_lbp; } out: if (!dump_opt['q']) { (void) printf("log_blk_count:\t %llu with valid cksum\n", (u_longlong_t)rebuild->dh_lb_count); (void) printf("\t\t %d with invalid cksum\n", failed); (void) printf("log_blk_asize:\t %llu\n\n", (u_longlong_t)rebuild->dh_lb_asize); } } static int dump_l2arc_header(int fd) { l2arc_dev_hdr_phys_t l2dhdr = {0}, rebuild = {0}; int error = B_FALSE; if (pread64(fd, &l2dhdr, sizeof (l2dhdr), VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { error = B_TRUE; } else { if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) error = B_TRUE; } if (error) { (void) printf("L2ARC device header not found\n\n"); 
/* Do not return an error here for backward compatibility */ return (0); } else if (!dump_opt['q']) { print_l2arc_header(); (void) printf(" magic: %llu\n", (u_longlong_t)l2dhdr.dh_magic); (void) printf(" version: %llu\n", (u_longlong_t)l2dhdr.dh_version); (void) printf(" pool_guid: %llu\n", (u_longlong_t)l2dhdr.dh_spa_guid); (void) printf(" flags: %llu\n", (u_longlong_t)l2dhdr.dh_flags); (void) printf(" start_lbps[0]: %llu\n", (u_longlong_t) l2dhdr.dh_start_lbps[0].lbp_daddr); (void) printf(" start_lbps[1]: %llu\n", (u_longlong_t) l2dhdr.dh_start_lbps[1].lbp_daddr); (void) printf(" log_blk_ent: %llu\n", (u_longlong_t)l2dhdr.dh_log_entries); (void) printf(" start: %llu\n", (u_longlong_t)l2dhdr.dh_start); (void) printf(" end: %llu\n", (u_longlong_t)l2dhdr.dh_end); (void) printf(" evict: %llu\n", (u_longlong_t)l2dhdr.dh_evict); (void) printf(" lb_asize_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_asize); (void) printf(" lb_count_refcount: %llu\n", (u_longlong_t)l2dhdr.dh_lb_count); (void) printf(" trim_action_time: %llu\n", (u_longlong_t)l2dhdr.dh_trim_action_time); (void) printf(" trim_state: %llu\n\n", (u_longlong_t)l2dhdr.dh_trim_state); } dump_l2arc_log_blocks(fd, &l2dhdr, &rebuild); /* * The total aligned size of log blocks and the number of log blocks * reported in the header of the device may be less than what zdb * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). * This happens because dump_l2arc_log_blocks() lacks the memory * pressure valve that l2arc_rebuild() has. Thus, if we are on a system * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize * and dh_lb_count will be lower to begin with than what exists on the * device. This is normal and zdb should not exit with an error. The * opposite case should never happen though, the values reported in the * header should never be higher than what dump_l2arc_log_blocks() and * l2arc_rebuild() report. If this happens there is a leak in the * accounting of log blocks. 
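 *
 * Put differently, the invariant verified below is that
 *
 *	l2dhdr.dh_lb_asize <= rebuild.dh_lb_asize &&
 *	l2dhdr.dh_lb_count <= rebuild.dh_lb_count
 *
 * and any violation makes this function return non-zero.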
*/ if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || l2dhdr.dh_lb_count > rebuild.dh_lb_count) return (1); return (0); } static void dump_config_from_label(zdb_label_t *label, size_t buflen, int l) { if (dump_opt['q']) return; if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) return; print_label_header(label, l); dump_nvlist(label->config_nv, 4); print_label_numbers(" labels = ", label->config); if (dump_opt['l'] >= 2) dump_nvlist_stats(label->config_nv, buflen); } #define ZDB_MAX_UB_HEADER_SIZE 32 static void dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) { vdev_t vd; char header[ZDB_MAX_UB_HEADER_SIZE]; vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)&label->label + uoff); cksum_record_t *rec = label->uberblocks[i]; if (rec == NULL) { if (dump_opt['u'] >= 2) { print_label_header(label, label_num); (void) printf(" Uberblock[%d] invalid\n", i); } continue; } if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) continue; if ((dump_opt['u'] < 4) && (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) continue; print_label_header(label, label_num); (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, " Uberblock[%d]\n", i); dump_uberblock(ub, header, ""); print_label_numbers(" labels = ", rec); } } static char curpath[PATH_MAX]; /* * Iterate through the path components, recursively passing * current one's obj and remaining path until we find the obj * for the last one. */ static int dump_path_impl(objset_t *os, uint64_t obj, char *name, uint64_t *retobj) { int err; boolean_t header = B_TRUE; uint64_t child_obj; char *s; dmu_buf_t *db; dmu_object_info_t doi; if ((s = strchr(name, '/')) != NULL) *s = '\0'; err = zap_lookup(os, obj, name, 8, 1, &child_obj); (void) strlcat(curpath, name, sizeof (curpath)); if (err != 0) { (void) fprintf(stderr, "failed to lookup %s: %s\n", curpath, strerror(err)); return (err); } child_obj = ZFS_DIRENT_OBJ(child_obj); err = sa_buf_hold(os, child_obj, FTAG, &db); if (err != 0) { (void) fprintf(stderr, "failed to get SA dbuf for obj %llu: %s\n", (u_longlong_t)child_obj, strerror(err)); return (EINVAL); } dmu_object_info_from_db(db, &doi); sa_buf_rele(db, FTAG); if (doi.doi_bonus_type != DMU_OT_SA && doi.doi_bonus_type != DMU_OT_ZNODE) { (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", doi.doi_bonus_type, (u_longlong_t)child_obj); return (EINVAL); } if (dump_opt['v'] > 6) { (void) printf("obj=%llu %s type=%d bonustype=%d\n", (u_longlong_t)child_obj, curpath, doi.doi_type, doi.doi_bonus_type); } (void) strlcat(curpath, "/", sizeof (curpath)); switch (doi.doi_type) { case DMU_OT_DIRECTORY_CONTENTS: if (s != NULL && *(s + 1) != '\0') return (dump_path_impl(os, child_obj, s + 1, retobj)); zfs_fallthrough; case DMU_OT_PLAIN_FILE_CONTENTS: if (retobj != NULL) { *retobj = child_obj; } else { dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0); } return (0); default: (void) fprintf(stderr, "object %llu has non-file/directory " "type %d\n", (u_longlong_t)obj, doi.doi_type); break; } return (EINVAL); } /* * Dump the blocks for the object specified by path inside the dataset. 
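 * This backs path-based lookups from the zdb command line (e.g. the
 * -O option); when retobj is non-NULL the caller only wants the
 * resolved object number rather than a full object dump.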
*/ static int dump_path(char *ds, char *path, uint64_t *retobj) { int err; objset_t *os; uint64_t root_obj; err = open_objset(ds, FTAG, &os); if (err != 0) return (err); err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); if (err != 0) { (void) fprintf(stderr, "can't lookup root znode: %s\n", strerror(err)); close_objset(os, FTAG); return (EINVAL); } (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); err = dump_path_impl(os, root_obj, path, retobj); close_objset(os, FTAG); return (err); } static int dump_backup_bytes(objset_t *os, void *buf, int len, void *arg) { const char *p = (const char *)buf; ssize_t nwritten; (void) os; (void) arg; /* Write the data out, handling short writes and signals. */ while ((nwritten = write(STDOUT_FILENO, p, len)) < len) { if (nwritten < 0) { if (errno == EINTR) continue; return (errno); } p += nwritten; len -= nwritten; } return (0); } static void dump_backup(const char *pool, uint64_t objset_id, const char *flagstr) { boolean_t embed = B_FALSE; boolean_t large_block = B_FALSE; boolean_t compress = B_FALSE; boolean_t raw = B_FALSE; const char *c; for (c = flagstr; c != NULL && *c != '\0'; c++) { switch (*c) { case 'e': embed = B_TRUE; break; case 'L': large_block = B_TRUE; break; case 'c': compress = B_TRUE; break; case 'w': raw = B_TRUE; break; default: fprintf(stderr, "dump_backup: invalid flag " "'%c'\n", *c); return; } } if (isatty(STDOUT_FILENO)) { fprintf(stderr, "dump_backup: stream cannot be written " "to a terminal\n"); return; } offset_t off = 0; dmu_send_outparams_t out = { .dso_outfunc = dump_backup_bytes, .dso_dryrun = B_FALSE, }; int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed, large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO, &off, &out); if (err != 0) { fprintf(stderr, "dump_backup: dmu_send_obj: %s\n", strerror(err)); return; } } static int zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) { int err = 0; uint64_t size, readsize, oursize, offset; ssize_t writesize; sa_handle_t *hdl; (void) printf("Copying object %" PRIu64 " to file %s\n", srcobj, destfile); VERIFY3P(os, ==, sa_os); if ((err = sa_handle_get(os, srcobj, NULL, SA_HDL_PRIVATE, &hdl))) { (void) printf("Failed to get handle for SA znode\n"); return (err); } if ((err = sa_lookup(hdl, sa_attr_table[ZPL_SIZE], &size, 8))) { (void) sa_handle_destroy(hdl); return (err); } (void) sa_handle_destroy(hdl); (void) printf("Object %" PRIu64 " is %" PRIu64 " bytes\n", srcobj, size); if (size == 0) { return (EINVAL); } int fd = open(destfile, O_WRONLY | O_CREAT | O_TRUNC, 0644); if (fd == -1) return (errno); /* * We cap the size at 1 mebibyte here to prevent * allocation failures and nigh-infinite printing if the * object is extremely large. 
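 * The copy loop below reuses this one buffer, reading at most
 * MIN(size - offset, 1 << 20) bytes per dmu_read() call and writing
 * each chunk out before advancing, so peak memory use stays at 1MiB
 * regardless of how large the source object is.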
*/ oursize = MIN(size, 1 << 20); offset = 0; char *buf = kmem_alloc(oursize, KM_NOSLEEP); if (buf == NULL) { (void) close(fd); return (ENOMEM); } while (offset < size) { readsize = MIN(size - offset, 1 << 20); err = dmu_read(os, srcobj, offset, readsize, buf, 0); if (err != 0) { (void) printf("got error %u from dmu_read\n", err); kmem_free(buf, oursize); (void) close(fd); return (err); } if (dump_opt['v'] > 3) { (void) printf("Read offset=%" PRIu64 " size=%" PRIu64 " error=%d\n", offset, readsize, err); } writesize = write(fd, buf, readsize); if (writesize < 0) { err = errno; break; } else if (writesize != readsize) { /* Incomplete write */ (void) fprintf(stderr, "Short write, only wrote %llu of" " %" PRIu64 " bytes, exiting...\n", (u_longlong_t)writesize, readsize); break; } offset += readsize; } (void) close(fd); if (buf != NULL) kmem_free(buf, oursize); return (err); } static boolean_t label_cksum_valid(vdev_label_t *label, uint64_t offset) { zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; zio_cksum_t expected_cksum; zio_cksum_t actual_cksum; zio_cksum_t verifier; zio_eck_t *eck; int byteswap; void *data = (char *)label + offsetof(vdev_label_t, vl_vdev_phys); eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; offset += offsetof(vdev_label_t, vl_vdev_phys); ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); expected_cksum = eck->zec_cksum; eck->zec_cksum = verifier; abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); abd_free(abd); if (byteswap) byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (B_TRUE); return (B_FALSE); } static int dump_label(const char *dev) { char path[MAXPATHLEN]; zdb_label_t labels[VDEV_LABELS] = {{{{0}}}}; uint64_t psize, ashift, l2cache; struct stat64 statbuf; boolean_t config_found = B_FALSE; boolean_t error = B_FALSE; boolean_t read_l2arc_header = B_FALSE; avl_tree_t config_tree; avl_tree_t uberblock_tree; void *node, *cookie; int fd; /* * Check if we were given absolute path and use it as is. * Otherwise if the provided vdev name doesn't point to a file, * try prepending expected disk paths and partition numbers. */ (void) strlcpy(path, dev, sizeof (path)); if (dev[0] != '/' && stat64(path, &statbuf) != 0) { int error; error = zfs_resolve_shortname(dev, path, MAXPATHLEN); if (error == 0 && zfs_dev_is_whole_disk(path)) { if (zfs_append_partition(path, MAXPATHLEN) == -1) error = ENOENT; } if (error || (stat64(path, &statbuf) != 0)) { (void) printf("failed to find device %s, try " "specifying absolute path instead\n", dev); return (1); } } if ((fd = open64(path, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", path, strerror(errno)); zdb_exit(1); } if (fstat64_blk(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", path, strerror(errno)); (void) close(fd); zdb_exit(1); } if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) (void) printf("failed to invalidate cache '%s' : %s\n", path, strerror(errno)); avl_create(&config_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); avl_create(&uberblock_tree, cksum_record_compare, sizeof (cksum_record_t), offsetof(cksum_record_t, link)); psize = statbuf.st_size; psize = P2ALIGN_TYPED(psize, sizeof (vdev_label_t), uint64_t); ashift = SPA_MINBLOCKSHIFT; /* * 1. Read the label from disk * 2. 
Verify label cksum * 3. Unpack the configuration and insert in config tree. * 4. Traverse all uberblocks and insert in uberblock tree. */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; char *buf = label->label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); nvlist_t *config; cksum_record_t *rec; zio_cksum_t cksum; vdev_t vd; label->label_offset = vdev_label_offset(psize, l, 0); if (pread64(fd, &label->label, sizeof (label->label), label->label_offset) != sizeof (label->label)) { if (!dump_opt['q']) (void) printf("failed to read label %d\n", l); label->read_failed = B_TRUE; error = B_TRUE; continue; } label->read_failed = B_FALSE; label->cksum_valid = label_cksum_valid(&label->label, label->label_offset); if (nvlist_unpack(buf, buflen, &config, 0) == 0) { nvlist_t *vdev_tree = NULL; size_t size; if ((nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) ashift = SPA_MINBLOCKSHIFT; if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) size = buflen; /* If the device is a cache device read the header. */ if (!read_l2arc_header) { if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && l2cache == POOL_STATE_L2CACHE) { read_l2arc_header = B_TRUE; } } fletcher_4_native_varsize(buf, size, &cksum); rec = cksum_record_insert(&config_tree, &cksum, l); label->config = rec; label->config_nv = config; config_found = B_TRUE; } else { error = B_TRUE; } vd.vdev_ashift = ashift; vd.vdev_top = &vd; for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); uberblock_t *ub = (void *)((char *)label + uoff); if (uberblock_verify(ub)) continue; fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); rec = cksum_record_insert(&uberblock_tree, &cksum, l); label->uberblocks[i] = rec; } } /* * Dump the label and uberblocks. */ for (int l = 0; l < VDEV_LABELS; l++) { zdb_label_t *label = &labels[l]; size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); if (label->read_failed == B_TRUE) continue; if (label->config_nv) { dump_config_from_label(label, buflen, l); } else { if (!dump_opt['q']) (void) printf("failed to unpack label %d\n", l); } if (dump_opt['u']) dump_label_uberblocks(label, ashift, l); nvlist_free(label->config_nv); } /* * Dump the L2ARC header, if existent. */ if (read_l2arc_header) error |= dump_l2arc_header(fd); cookie = NULL; while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); cookie = NULL; while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) umem_free(node, sizeof (cksum_record_t)); avl_destroy(&config_tree); avl_destroy(&uberblock_tree); (void) close(fd); return (config_found == B_FALSE ? 2 : (error == B_TRUE ? 
1 : 0)); } static uint64_t dataset_feature_count[SPA_FEATURES]; static uint64_t global_feature_count[SPA_FEATURES]; static uint64_t remap_deadlist_count = 0; static int dump_one_objset(const char *dsname, void *arg) { (void) arg; int error; objset_t *os; spa_feature_t f; error = open_objset(dsname, FTAG, &os); if (error != 0) return (0); for (f = 0; f < SPA_FEATURES; f++) { if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) continue; ASSERT(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); dataset_feature_count[f]++; } if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { remap_deadlist_count++; } for (dsl_bookmark_node_t *dbn = avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); if (dbn->dbn_phys.zbm_redaction_obj != 0) { global_feature_count[ SPA_FEATURE_REDACTION_BOOKMARKS]++; objset_t *mos = os->os_spa->spa_meta_objset; dnode_t *rl; VERIFY0(dnode_hold(mos, dbn->dbn_phys.zbm_redaction_obj, FTAG, &rl)); if (rl->dn_have_spill) { global_feature_count[ SPA_FEATURE_REDACTION_LIST_SPILL]++; } } if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; } if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && !dmu_objset_is_snapshot(os)) { global_feature_count[SPA_FEATURE_LIVELIST]++; } dump_objset(os); close_objset(os, FTAG); fuid_table_destroy(); return (0); } /* * Block statistics. */ #define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) typedef struct zdb_blkstats { uint64_t zb_asize; uint64_t zb_lsize; uint64_t zb_psize; uint64_t zb_count; uint64_t zb_gangs; uint64_t zb_ditto_samevdev; uint64_t zb_ditto_same_ms; uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; } zdb_blkstats_t; /* * Extended object types to report deferred frees and dedup auto-ditto blocks. 
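 * These pseudo-types sit just past DMU_OT_NUMTYPES so they can share
 * the zcb_type[][] accounting table with the real DMU object types;
 * zdb_ot_extname[] supplies their row labels, with ZDB_OT_TOTAL acting
 * as the catch-all "Total" row.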
*/ #define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) #define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) #define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) #define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) static const char *zdb_ot_extname[] = { "deferred free", "dedup ditto", "other", "Total", }; #define ZB_TOTAL DN_MAX_LEVELS #define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) typedef struct zdb_brt_entry { dva_t zbre_dva; uint64_t zbre_refcount; avl_node_t zbre_node; } zdb_brt_entry_t; typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; uint64_t zcb_removing_size; uint64_t zcb_checkpoint_size; uint64_t zcb_dedup_asize; uint64_t zcb_dedup_blocks; uint64_t zcb_clone_asize; uint64_t zcb_clone_blocks; uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; uint64_t zcb_psize_total; uint64_t zcb_lsize_total; uint64_t zcb_asize_total; uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] [BPE_PAYLOAD_SIZE + 1]; uint64_t zcb_start; hrtime_t zcb_lastprint; uint64_t zcb_totalasize; uint64_t zcb_errors[256]; int zcb_readfails; int zcb_haderrors; spa_t *zcb_spa; uint32_t **zcb_vd_obsolete_counts; avl_tree_t zcb_brt; boolean_t zcb_brt_is_active; } zdb_cb_t; /* test if two DVA offsets from same vdev are within the same metaslab */ static boolean_t same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) { vdev_t *vd = vdev_lookup_top(spa, vdev); uint64_t ms_shift = vd->vdev_ms_shift; return ((off1 >> ms_shift) == (off2 >> ms_shift)); } /* * Used to simplify reporting of the histogram data. */ typedef struct one_histo { const char *name; uint64_t *count; uint64_t *len; uint64_t cumulative; } one_histo_t; /* * The number of separate histograms processed for psize, lsize and asize. */ #define NUM_HISTO 3 /* * This routine will create a fixed column size output of three different * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M * the count, length and cumulative length of the psize, lsize and * asize blocks. * * All three types of blocks are listed on a single line * * By default the table is printed in nicenumber format (e.g. 123K) but * if the '-P' parameter is specified then the full raw number (parseable) * is printed out. */ static void dump_size_histograms(zdb_cb_t *zcb) { /* * A temporary buffer that allows us to convert a number into * a string using zdb_nicenumber to allow either raw or human * readable numbers to be output. */ char numbuf[32]; /* * Define titles which are used in the headers of the tables * printed by this routine. */ const char blocksize_title1[] = "block"; const char blocksize_title2[] = "size"; const char count_title[] = "Count"; const char length_title[] = "Size"; const char cumulative_title[] = "Cum."; /* * Setup the histogram arrays (psize, lsize, and asize). 
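 * The three entries line up with the three column groups printed
 * below: psize (physical, i.e. compressed, size), lsize (logical
 * size) and asize (allocated size, including ditto copies and
 * gang/parity overhead).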
*/ one_histo_t parm_histo[NUM_HISTO]; parm_histo[0].name = "psize"; parm_histo[0].count = zcb->zcb_psize_count; parm_histo[0].len = zcb->zcb_psize_len; parm_histo[0].cumulative = 0; parm_histo[1].name = "lsize"; parm_histo[1].count = zcb->zcb_lsize_count; parm_histo[1].len = zcb->zcb_lsize_len; parm_histo[1].cumulative = 0; parm_histo[2].name = "asize"; parm_histo[2].count = zcb->zcb_asize_count; parm_histo[2].len = zcb->zcb_asize_len; parm_histo[2].cumulative = 0; (void) printf("\nBlock Size Histogram\n"); /* * Print the first line titles */ if (dump_opt['P']) (void) printf("\n%s\t", blocksize_title1); else (void) printf("\n%7s ", blocksize_title1); for (int j = 0; j < NUM_HISTO; j++) { if (dump_opt['P']) { if (j < NUM_HISTO - 1) { (void) printf("%s\t\t\t", parm_histo[j].name); } else { /* Don't print trailing spaces */ (void) printf(" %s", parm_histo[j].name); } } else { if (j < NUM_HISTO - 1) { /* Left aligned strings in the output */ (void) printf("%-7s ", parm_histo[j].name); } else { /* Don't print trailing spaces */ (void) printf("%s", parm_histo[j].name); } } } (void) printf("\n"); /* * Print the second line titles */ if (dump_opt['P']) { (void) printf("%s\t", blocksize_title2); } else { (void) printf("%7s ", blocksize_title2); } for (int i = 0; i < NUM_HISTO; i++) { if (dump_opt['P']) { (void) printf("%s\t%s\t%s\t", count_title, length_title, cumulative_title); } else { (void) printf("%7s%7s%7s", count_title, length_title, cumulative_title); } } (void) printf("\n"); /* * Print the rows */ for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { /* * Print the first column showing the blocksize */ zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); if (dump_opt['P']) { printf("%s", numbuf); } else { printf("%7s:", numbuf); } /* * Print the remaining set of 3 columns per size: * for psize, lsize and asize */ for (int j = 0; j < NUM_HISTO; j++) { parm_histo[j].cumulative += parm_histo[j].len[i]; zdb_nicenum(parm_histo[j].count[i], numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); zdb_nicenum(parm_histo[j].len[i], numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); zdb_nicenum(parm_histo[j].cumulative, numbuf, sizeof (numbuf)); if (dump_opt['P']) (void) printf("\t%s", numbuf); else (void) printf("%7s", numbuf); } (void) printf("\n"); } } static void zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, dmu_object_type_t type) { int i; ASSERT(type < ZDB_OT_TOTAL); if (zilog && zil_bp_tree_add(zilog, bp) != 0) return; /* * This flag controls if we will issue a claim for the block while * counting it, to ensure that all blocks are referenced in space maps. * We don't issue claims if we're not doing leak tracking, because it's * expensive if the user isn't interested. We also don't claim the * second or later occurences of cloned or dedup'd blocks, because we * already claimed them the first time. */ boolean_t do_claim = !dump_opt['L']; spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); blkptr_t tempbp; if (BP_GET_DEDUP(bp)) { /* * Dedup'd blocks are special. We need to count them, so we can * later uncount them when reporting leaked space, and we must * only claim them once. * * We use the existing dedup system to track what we've seen. * The first time we see a block, we do a ddt_lookup() to see * if it exists in the DDT. If we're doing leak tracking, we * claim the block at this time. 
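 * (So for a block carrying three DDT references, the first sighting is
 * claimed and counted like any other block, and the remaining two are
 * tallied as duplicates by the logic described next.)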
* * Each time we see a block, we reduce the refcount in the * entry by one, and add to the size and count of dedup'd * blocks to report at the end. */ ddt_t *ddt = ddt_select(zcb->zcb_spa, bp); ddt_enter(ddt); /* * Find the block. This will create the entry in memory, but * we'll know if that happened by its refcount. */ ddt_entry_t *dde = ddt_lookup(ddt, bp, B_TRUE); /* * ddt_lookup() can return NULL if this block didn't exist * in the DDT and creating it would take the DDT over its * quota. Since we got the block from disk, it must exist in * the DDT, so this can't happen. However, when unique entries * are pruned, the dedup bit can be set with no corresponding * entry in the DDT. */ if (dde == NULL) { ddt_exit(ddt); goto skipped; } /* Get the phys for this variant */ ddt_phys_variant_t v = ddt_phys_select(ddt, dde, bp); /* * This entry may have multiple sets of DVAs. We must claim * each set the first time we see them in a real block on disk, * or count them on subsequent occurences. We don't have a * convenient way to track the first time we see each variant, * so we repurpose dde_io as a set of "seen" flag bits. We can * do this safely in zdb because it never writes, so it will * never have a writing zio for this block in that pointer. */ boolean_t seen = !!(((uintptr_t)dde->dde_io) & (1 << v)); if (!seen) dde->dde_io = (void *)(((uintptr_t)dde->dde_io) | (1 << v)); /* Consume a reference for this block. */ if (ddt_phys_total_refcnt(ddt, dde->dde_phys) > 0) ddt_phys_decref(dde->dde_phys, v); /* * If this entry has a single flat phys, it may have been * extended with additional DVAs at some time in its life. * This block might be from before it was fully extended, and * so have fewer DVAs. * * If this is the first time we've seen this block, and we * claimed it as-is, then we would miss the claim on some * number of DVAs, which would then be seen as leaked. * * In all cases, if we've had fewer DVAs, then the asize would * be too small, and would lead to the pool apparently using * more space than allocated. * * To handle this, we copy the canonical set of DVAs from the * entry back to the block pointer before we claim it. */ if (v == DDT_PHYS_FLAT) { ASSERT3U(BP_GET_PHYSICAL_BIRTH(bp), ==, ddt_phys_birth(dde->dde_phys, v)); tempbp = *bp; ddt_bp_fill(dde->dde_phys, v, &tempbp, BP_GET_PHYSICAL_BIRTH(bp)); bp = &tempbp; } if (seen) { /* * The second or later time we see this block, * it's a duplicate and we count it. */ zcb->zcb_dedup_asize += BP_GET_ASIZE(bp); zcb->zcb_dedup_blocks++; /* Already claimed, don't do it again. */ do_claim = B_FALSE; } ddt_exit(ddt); } else if (zcb->zcb_brt_is_active && brt_maybe_exists(zcb->zcb_spa, bp)) { /* * Cloned blocks are special. We need to count them, so we can * later uncount them when reporting leaked space, and we must * only claim them once. * * To do this, we keep our own in-memory BRT. For each block * we haven't seen before, we look it up in the real BRT and * if its there, we note it and its refcount then proceed as * normal. If we see the block again, we count it as a clone * and then give it no further consideration. 
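 * The in-memory tree is keyed on blk_dva[0], the same DVA used to look
 * the block up in the BRT.  Each later sighting consumes one reference
 * from the refcount copied on first sight; once it drains to zero the
 * entry is removed, since no further clones of that block remain to be
 * counted.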
*/ zdb_brt_entry_t zbre_search, *zbre; avl_index_t where; zbre_search.zbre_dva = bp->blk_dva[0]; zbre = avl_find(&zcb->zcb_brt, &zbre_search, &where); if (zbre == NULL) { /* Not seen before; track it */ uint64_t refcnt = brt_entry_get_refcount(zcb->zcb_spa, bp); if (refcnt > 0) { zbre = umem_zalloc(sizeof (zdb_brt_entry_t), UMEM_NOFAIL); zbre->zbre_dva = bp->blk_dva[0]; zbre->zbre_refcount = refcnt; avl_insert(&zcb->zcb_brt, zbre, where); } } else { /* * Second or later occurrence, count it and take a * refcount. */ zcb->zcb_clone_asize += BP_GET_ASIZE(bp); zcb->zcb_clone_blocks++; zbre->zbre_refcount--; if (zbre->zbre_refcount == 0) { avl_remove(&zcb->zcb_brt, zbre); umem_free(zbre, sizeof (zdb_brt_entry_t)); } /* Already claimed, don't do it again. */ do_claim = B_FALSE; } } skipped: for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; int t = (i & 1) ? type : ZDB_OT_TOTAL; int equal; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_count++; /* * The histogram is only big enough to record blocks up to * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, * "other", bucket. */ unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); zb->zb_psize_histogram[idx]++; zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) { zb->zb_ditto_samevdev++; if (same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; } break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal != 0) { zb->zb_ditto_samevdev++; if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[1]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; else if (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2]) && same_metaslab(zcb->zcb_spa, DVA_GET_VDEV(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[1]), DVA_GET_OFFSET(&bp->blk_dva[2]))) zb->zb_ditto_same_ms++; } break; } } spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); if (BP_IS_EMBEDDED(bp)) { zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] [BPE_GET_PSIZE(bp)]++; return; } /* * The binning histogram bins by powers of two up to * SPA_MAXBLOCKSIZE rather than creating bins for * every possible blocksize found in the pool. 
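 * Sizes are binned by power-of-two floor.  For example, a block with a
 * 20K psize has highbit64(20480) - 1 == 14 and is therefore counted in
 * the 16K (1 << 14) bucket of the histograms printed by
 * dump_size_histograms().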
*/ int bin = highbit64(BP_GET_PSIZE(bp)) - 1; zcb->zcb_psize_count[bin]++; zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); zcb->zcb_psize_total += BP_GET_PSIZE(bp); bin = highbit64(BP_GET_LSIZE(bp)) - 1; zcb->zcb_lsize_count[bin]++; zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); zcb->zcb_lsize_total += BP_GET_LSIZE(bp); bin = highbit64(BP_GET_ASIZE(bp)) - 1; zcb->zcb_asize_count[bin]++; zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); zcb->zcb_asize_total += BP_GET_ASIZE(bp); if (!do_claim) return; VERIFY0(zio_wait(zio_claim(NULL, zcb->zcb_spa, spa_min_claim_txg(zcb->zcb_spa), bp, NULL, NULL, ZIO_FLAG_CANFAIL))); } static void zdb_blkptr_done(zio_t *zio) { spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; int ioerr = zio->io_error; zdb_cb_t *zcb = zio->io_private; zbookmark_phys_t *zb = &zio->io_bookmark; mutex_enter(&spa->spa_scrub_lock); spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { char blkbuf[BP_SPRINTF_LEN]; zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); else blkbuf[0] = '\0'; (void) printf("zdb_blkptr_cb: " "Got error %d reading " "<%llu, %llu, %lld, %llx> %s -- skipping\n", ioerr, (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid, blkbuf); } mutex_exit(&spa->spa_scrub_lock); abd_free(zio->io_abd); } static int zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; dmu_object_type_t type; boolean_t is_metadata; if (zb->zb_level == ZB_DNODE_LEVEL) return (0); if (dump_opt['b'] >= 5 && BP_GET_BIRTH(bp) > 0) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("objset %llu object %llu " "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); type = BP_GET_TYPE(bp); zdb_count_block(zcb, zilog, bp, (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); if (!BP_IS_EMBEDDED(bp) && (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { size_t size = BP_GET_PSIZE(bp); abd_t *abd = abd_alloc(size, B_FALSE); int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; /* If it's an intent log block, failure is expected. 
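 * ZIL blocks are allocated ahead of use and chained together, so a
 * chain can legitimately end in a block that was never written.  The
 * read is marked ZIO_FLAG_SPECULATIVE so that zdb_blkptr_done() above
 * does not count such a failure as a pool error.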
*/ if (zb->zb_level == ZB_ZIL_LEVEL) flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); while (spa->spa_load_verify_bytes > max_inflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_load_verify_bytes += size; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); } zcb->zcb_readfails = 0; /* only call gethrtime() every 100 blocks */ static int iters; if (++iters > 100) iters = 0; else return (0); if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { uint64_t now = gethrtime(); char buf[10]; uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; uint64_t kb_per_sec = 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); uint64_t sec_remaining = (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; /* make sure nicenum has enough space */ _Static_assert(sizeof (buf) >= NN_NUMBUF_SZ, "buf truncated"); zfs_nicebytes(bytes, buf, sizeof (buf)); (void) fprintf(stderr, "\r%5s completed (%4"PRIu64"MB/s) " "estimated time remaining: " "%"PRIu64"hr %02"PRIu64"min %02"PRIu64"sec ", buf, kb_per_sec / 1024, sec_remaining / 60 / 60, sec_remaining / 60 % 60, sec_remaining % 60); zcb->zcb_lastprint = now; } return (0); } static void zdb_leak(void *arg, uint64_t start, uint64_t size) { vdev_t *vd = arg; (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); } static metaslab_ops_t zdb_metaslab_ops = { NULL /* alloc */ }; static int load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { spa_vdev_removal_t *svr = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; /* skip vdevs we don't care about */ if (sme->sme_vdev != svr->svr_vdev_id) return (0); vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (sme->sme_type == SM_ALLOC) zfs_range_tree_add(svr->svr_allocd_segs, offset, size); else zfs_range_tree_remove(svr->svr_allocd_segs, offset, size); return (0); } static void claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, uint64_t size, void *arg) { (void) inner_offset, (void) arg; /* * This callback was called through a remap from * a device being removed. Therefore, the vdev that * this callback is applied to is a concrete * vdev. */ ASSERT(vdev_is_concrete(vd)); VERIFY0(metaslab_claim_impl(vd, offset, size, spa_min_claim_txg(vd->vdev_spa))); } static void claim_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_ops.vdev_op_remap(vd, offset, size, claim_segment_impl_cb, NULL); } /* * After accounting for all allocated blocks that are directly referenced, * we might have missed a reference to a block from a partially complete * (and thus unused) indirect mapping object. We perform a secondary pass * through the metaslabs we have already mapped and claim the destination * blocks. 
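 * Concretely, the function below rebuilds the set of segments still
 * allocated on the removing vdev from its metaslab space maps (plus
 * any unflushed space map log entries), trims off everything past the
 * synced portion of the indirect mapping, and then claims each
 * remaining segment through the indirect vdev's remap callback.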
*/ static void zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return; if (spa->spa_vdev_removal == NULL) return; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); zfs_range_tree_t *allocs = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, 0, "zdb_claim_removing:allocs"); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; ASSERT0(zfs_range_tree_space(allocs)); if (msp->ms_sm != NULL) VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); zfs_range_tree_vacate(allocs, zfs_range_tree_add, svr->svr_allocd_segs); } zfs_range_tree_destroy(allocs); iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); /* * Clear everything past what has been synced, * because we have not allocated mappings for * it yet. */ zfs_range_tree_clear(svr->svr_allocd_segs, vdev_indirect_mapping_max_offset(vim), vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); zcb->zcb_removing_size += zfs_range_tree_space(svr->svr_allocd_segs); zfs_range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); spa_config_exit(spa, SCL_CONFIG, FTAG); } static int increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { (void) tx; zdb_cb_t *zcb = arg; spa_t *spa = zcb->zcb_spa; vdev_t *vd; const dva_t *dva = &bp->blk_dva[0]; ASSERT(!bp_freed); ASSERT(!dump_opt['L']); ASSERT3U(BP_GET_NDVAS(bp), ==, 1); spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); ASSERT3P(vd, !=, NULL); spa_config_exit(spa, SCL_VDEV, FTAG); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); vdev_indirect_mapping_increment_obsolete_count( vd->vdev_indirect_mapping, DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), zcb->zcb_vd_obsolete_counts[vd->vdev_id]); return (0); } static uint32_t * zdb_load_obsolete_counts(vdev_t *vd) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; spa_t *spa = vd->vdev_spa; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; uint64_t obsolete_sm_object; uint32_t *counts; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL); counts = vdev_indirect_mapping_load_obsolete_counts(vim); if (vd->vdev_obsolete_sm != NULL) { vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, vd->vdev_obsolete_sm); } if (scip->scip_vdev == vd->vdev_id && scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); } return (counts); } typedef struct checkpoint_sm_exclude_entry_arg { vdev_t *cseea_vd; uint64_t cseea_checkpoint_size; } checkpoint_sm_exclude_entry_arg_t; static int checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) { checkpoint_sm_exclude_entry_arg_t *cseea = arg; vdev_t *vd = cseea->cseea_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); /* * Since the vdev_checkpoint_sm exists in the vdev level * and the ms_sm space maps 
exist in the metaslab level, * an entry in the checkpoint space map could theoretically * cross the boundaries of the metaslab that it belongs. * * In reality, because of the way that we populate and * manipulate the checkpoint's space maps currently, * there shouldn't be any entries that cross metaslabs. * Hence the assertion below. * * That said, there is no fundamental requirement that * the checkpoint's space map entries should not cross * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * By removing the entry from the allocated segments we * also verify that the entry is there to begin with. */ mutex_enter(&ms->ms_lock); zfs_range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); cseea->cseea_checkpoint_size += sme->sme_run; return (0); } static void zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) { spa_t *spa = vd->vdev_spa; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; /* * If there is no vdev_top_zap, we are in a pool whose * version predates the pool checkpoint feature. */ if (vd->vdev_top_zap == 0) return; /* * If there is no reference of the vdev_checkpoint_sm in * the vdev_top_zap, then one of the following scenarios * is true: * * 1] There is no checkpoint * 2] There is a checkpoint, but no checkpointed blocks * have been freed yet * 3] The current vdev is indirect * * In these cases we return immediately. */ if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) return; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); checkpoint_sm_exclude_entry_arg_t cseea; cseea.cseea_vd = vd; cseea.cseea_checkpoint_size = 0; VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); VERIFY0(space_map_iterate(checkpoint_sm, space_map_length(checkpoint_sm), checkpoint_sm_exclude_entry_cb, &cseea)); space_map_close(checkpoint_sm); zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; } static void zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); } } static int count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { int64_t *ualloc_space = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (sme->sme_type == SM_ALLOC) *ualloc_space += sme->sme_run; else *ualloc_space -= sme->sme_run; return (0); } static int64_t get_unflushed_alloc_space(spa_t *spa) { if (dump_opt['L']) return (0); int64_t ualloc_space = 0; iterate_through_spacemap_logs(spa, count_unflushed_space_cb, &ualloc_space); return (ualloc_space); } static int load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { maptype_t *uic_maptype = arg; uint64_t offset = sme->sme_offset; uint64_t size = sme->sme_run; uint64_t vdev_id = sme->sme_vdev; vdev_t *vd = 
vdev_lookup_top(spa, vdev_id); /* skip indirect vdevs */ if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); if (*uic_maptype == sme->sme_type) zfs_range_tree_add(ms->ms_allocatable, offset, size); else zfs_range_tree_remove(ms->ms_allocatable, offset, size); return (0); } static void load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) { iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); } static void load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; ASSERT3U(i, ==, vd->vdev_id); if (vd->vdev_ops == &vdev_indirect_ops) continue; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; (void) fprintf(stderr, "\rloading concrete vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)msp->ms_id, (longlong_t)vd->vdev_ms_count); mutex_enter(&msp->ms_lock); zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; if (msp->ms_sm != NULL) { VERIFY0(space_map_load(msp->ms_sm, msp->ms_allocatable, maptype)); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } } load_unflushed_to_ms_allocatables(spa, maptype); } /* * vm_idxp is an in-out parameter which (for indirect vdevs) is the * index in vim_entries that has the first entry in this metaslab. * On return, it will be set to the first entry after this metaslab. */ static void load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, uint64_t *vim_idxp) { vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; mutex_enter(&msp->ms_lock); zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); /* * We don't want to spend the CPU manipulating the * size-ordered tree, so clear the range_tree ops. */ msp->ms_allocatable->rt_ops = NULL; for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); (*vim_idxp)++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[*vim_idxp]; uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); ASSERT3U(ent_offset, >=, msp->ms_start); if (ent_offset >= msp->ms_start + msp->ms_size) break; /* * Mappings do not cross metaslab boundaries, * because we create them by walking the metaslabs. */ ASSERT3U(ent_offset + ent_len, <=, msp->ms_start + msp->ms_size); zfs_range_tree_add(msp->ms_allocatable, ent_offset, ent_len); } if (!msp->ms_loaded) msp->ms_loaded = B_TRUE; mutex_exit(&msp->ms_lock); } static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { ASSERT(!dump_opt['L']); vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; ASSERT3U(c, ==, vd->vdev_id); if (vd->vdev_ops != &vdev_indirect_ops) continue; /* * Note: we don't check for mapping leaks on * removing vdevs because their ms_allocatable's * are used to look for leaks in allocated space. */ zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); /* * Normally, indirect vdevs don't have any * metaslabs. We want to set them up for * zio_claim(). 
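 *
 * To do that, the code below creates a metaslab group and metaslabs
 * for the indirect vdev and seeds each metaslab's ms_allocatable
 * with the mapped ranges from the indirect mapping via
 * load_indirect_ms_allocatable_tree(), consuming every mapping
 * entry exactly once (see the ASSERT on vim_idx).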
*/ vdev_metaslab_group_create(vd); VERIFY0(vdev_metaslab_init(vd, 0)); vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; uint64_t vim_idx = 0; for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { (void) fprintf(stderr, "\rloading indirect vdev %llu, " "metaslab %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vd->vdev_ms[m]->ms_id, (longlong_t)vd->vdev_ms_count); load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], &vim_idx); } ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); } } static void zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; if (dump_opt['L']) return; dsl_pool_t *dp = spa->spa_dsl_pool; vdev_t *rvd = spa->spa_root_vdev; /* * We are going to be changing the meaning of the metaslab's * ms_allocatable. Ensure that the allocator doesn't try to * use the tree. */ spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), UMEM_NOFAIL); /* * For leak detection, we overload the ms_allocatable trees * to contain allocated segments instead of free segments. * As a result, we can't use the normal metaslab_load/unload * interfaces. */ zdb_leak_init_prepare_indirect_vdevs(spa, zcb); load_concrete_ms_allocatable_trees(spa, SM_ALLOC); /* * On load_concrete_ms_allocatable_trees() we loaded all the * allocated entries from the ms_sm to the ms_allocatable for * each metaslab. If the pool has a checkpoint or is in the * middle of discarding a checkpoint, some of these blocks * may have been freed but their ms_sm may not have been * updated because they are referenced by the checkpoint. In * order to avoid false-positives during leak-detection, we * go through the vdev's checkpoint space map and exclude all * its entries from their relevant ms_allocatable. * * We also aggregate the space held by the checkpoint and add * it to zcb_checkpoint_size. * * Note that at this point we are also verifying that all the * entries on the checkpoint_sm are marked as allocated in * the ms_sm of their relevant metaslab. * [see comment in checkpoint_sm_exclude_entry_cb()] */ zdb_leak_init_exclude_checkpoint(spa, zcb); ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); /* for cleaner progress output */ (void) fprintf(stderr, "\n"); if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, increment_indirect_mapping_cb, zcb, NULL); } } static boolean_t zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) { boolean_t leaks = B_FALSE; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t total_leaked = 0; boolean_t are_precise = B_FALSE; ASSERT(vim != NULL); for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { vdev_indirect_mapping_entry_phys_t *vimep = &vim->vim_entries[i]; uint64_t obsolete_bytes = 0; uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; /* * This is not very efficient but it's easy to * verify correctness. 
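 *
 * The loop below walks the mapping entry one (1 << vdev_ashift)
 * sized block at a time, counting how many of its blocks are still
 * present in ms_allocatable (i.e. are not referenced by any block
 * pointer), and compares that against the obsolete count recorded
 * for the entry.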
*/ for (uint64_t inner_offset = 0; inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst); inner_offset += 1ULL << vd->vdev_ashift) { if (zfs_range_tree_contains(msp->ms_allocatable, offset + inner_offset, 1ULL << vd->vdev_ashift)) { obsolete_bytes += 1ULL << vd->vdev_ashift; } } int64_t bytes_leaked = obsolete_bytes - zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]; ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=, zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]); VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) { (void) printf("obsolete indirect mapping count " "mismatch on %llu:%llx:%llx : %llx bytes leaked\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), (u_longlong_t)bytes_leaked); } total_leaked += ABS(bytes_leaked); } VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (!are_precise && total_leaked > 0) { int pct_leaked = total_leaked * 100 / vdev_indirect_mapping_bytes_mapped(vim); (void) printf("cannot verify obsolete indirect mapping " "counts of vdev %llu because precise feature was not " "enabled when it was removed: %d%% (%llx bytes) of mapping" "unreferenced\n", (u_longlong_t)vd->vdev_id, pct_leaked, (u_longlong_t)total_leaked); } else if (total_leaked > 0) { (void) printf("obsolete indirect mapping count mismatch " "for vdev %llu -- %llx total bytes mismatched\n", (u_longlong_t)vd->vdev_id, (u_longlong_t)total_leaked); leaks |= B_TRUE; } vdev_indirect_mapping_free_obsolete_counts(vim, zcb->zcb_vd_obsolete_counts[vd->vdev_id]); zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL; return (leaks); } static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { if (dump_opt['L']) return (B_FALSE); boolean_t leaks = B_FALSE; vdev_t *rvd = spa->spa_root_vdev; for (unsigned c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; if (zcb->zcb_vd_obsolete_counts[c] != NULL) { leaks |= zdb_check_for_obsolete_leaks(vd, zcb); } for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == spa_embedded_log_class(spa) || msp->ms_group->mg_class == spa_special_embedded_log_class(spa)) ? vd->vdev_log_mg : vd->vdev_mg); /* * ms_allocatable has been overloaded * to contain allocated segments. Now that * we finished traversing all blocks, any * block that remains in the ms_allocatable * represents an allocated block that we * did not claim during the traversal. * Claimed blocks would have been removed * from the ms_allocatable. For indirect * vdevs, space remaining in the tree * represents parts of the mapping that are * not referenced, which is not a bug. 
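 *
 * Accordingly, indirect vdevs are simply vacated below, while
 * concrete vdevs report every remaining segment through zdb_leak().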
*/ if (vd->vdev_ops == &vdev_indirect_ops) { zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); } else { zfs_range_tree_vacate(msp->ms_allocatable, zdb_leak, vd); } if (msp->ms_loaded) { msp->ms_loaded = B_FALSE; } } } umem_free(zcb->zcb_vd_obsolete_counts, rvd->vdev_children * sizeof (uint32_t *)); zcb->zcb_vd_obsolete_counts = NULL; return (leaks); } static int count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { (void) tx; zdb_cb_t *zcb = arg; if (dump_opt['b'] >= 5) { char blkbuf[BP_SPRINTF_LEN]; snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("[%s] %s\n", "deferred free", blkbuf); } zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); return (0); } /* * Iterate over livelists which have been destroyed by the user but * are still present in the MOS, waiting to be freed */ static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) { objset_t *mos = spa->spa_meta_objset; uint64_t zap_obj; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; ASSERT0(err); zap_cursor_t zc; zap_attribute_t *attrp = zap_attribute_alloc(); dsl_deadlist_t ll; /* NULL out os prior to dsl_deadlist_open in case it's garbage */ ll.dl_os = NULL; for (zap_cursor_init(&zc, mos, zap_obj); zap_cursor_retrieve(&zc, attrp) == 0; (void) zap_cursor_advance(&zc)) { VERIFY0(dsl_deadlist_open(&ll, mos, attrp->za_first_integer)); func(&ll, arg); dsl_deadlist_close(&ll); } zap_cursor_fini(&zc); zap_attribute_free(attrp); } static int bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) { ASSERT(!bp_freed); return (count_block_cb(arg, bp, tx)); } static int livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) { zdb_cb_t *zbc = args; bplist_t blks; bplist_create(&blks); /* determine which blocks have been alloc'd but not freed */ VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); /* count those blocks */ (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); bplist_destroy(&blks); return (0); } static void livelist_count_blocks(dsl_deadlist_t *ll, void *arg) { dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); } /* * Count the blocks in the livelists that have been destroyed by the user * but haven't yet been freed. */ static void deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) { iterate_deleted_livelists(spa, livelist_count_blocks, zbc); } static void dump_livelist_cb(dsl_deadlist_t *ll, void *arg) { ASSERT0P(arg); global_feature_count[SPA_FEATURE_LIVELIST]++; dump_blkptr_list(ll, "Deleted Livelist"); dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); } /* * Print out, register object references to, and increment feature counts for * livelists that have been destroyed by the user but haven't yet been freed. 
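 *
 * The deleted livelists are found through the DMU_POOL_DELETED_CLONES
 * ZAP in the MOS directory; if that ZAP does not exist, there is
 * nothing to dump.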
*/ static void deleted_livelists_dump_mos(spa_t *spa) { uint64_t zap_obj; objset_t *mos = spa->spa_meta_objset; int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); if (err == ENOENT) return; mos_obj_refd(zap_obj); iterate_deleted_livelists(spa, dump_livelist_cb, NULL); } static int zdb_brt_entry_compare(const void *zcn1, const void *zcn2) { const dva_t *dva1 = &((const zdb_brt_entry_t *)zcn1)->zbre_dva; const dva_t *dva2 = &((const zdb_brt_entry_t *)zcn2)->zbre_dva; int cmp; cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); if (cmp == 0) cmp = TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2)); return (cmp); } static int dump_block_stats(spa_t *spa) { zdb_cb_t *zcb; zdb_blkstats_t *zb, *tzb; uint64_t norm_alloc, norm_space, total_alloc, total_found; int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; boolean_t leaks = B_FALSE; int e, c, err; bp_embedded_type_t i; ddt_prefetch_all(spa); zcb = umem_zalloc(sizeof (zdb_cb_t), UMEM_NOFAIL); if (spa_feature_is_active(spa, SPA_FEATURE_BLOCK_CLONING)) { avl_create(&zcb->zcb_brt, zdb_brt_entry_compare, sizeof (zdb_brt_entry_t), offsetof(zdb_brt_entry_t, zbre_node)); zcb->zcb_brt_is_active = B_TRUE; } (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", (dump_opt['c'] == 1) ? "metadata " : "", dump_opt['c'] ? "checksums " : "", (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", !dump_opt['L'] ? "nothing leaked " : ""); /* * When leak detection is enabled we load all space maps as SM_ALLOC * maps, then traverse the pool claiming each block we discover. If * the pool is perfectly consistent, the segment trees will be empty * when we're done. Anything left over is a leak; any block we can't * claim (because it's not part of any space map) is a double * allocation, reference to a freed block, or an unclaimed log block. * * When leak detection is disabled (-L option) we still traverse the * pool claiming each block we discover, but we skip opening any space * maps. */ zdb_leak_init(spa, zcb); /* * If there's a deferred-free bplist, process that first. */ (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, bpobj_count_block_cb, zcb, NULL); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, bpobj_count_block_cb, zcb, NULL); } zdb_claim_removing(spa, zcb); if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, zcb, NULL)); } deleted_livelists_count_blocks(spa, zcb); if (dump_opt['c'] > 1) flags |= TRAVERSE_PREFETCH_DATA; zcb->zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_embedded_log_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_special_embedded_log_class(spa)); zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); /* * If we've traversed the data blocks then we need to wait for those * I/Os to complete. We leverage "The Godfather" zio to wait on * all async I/Os to complete. 
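 *
 * Each per-CPU spa_async_zio_root is waited on below and then
 * re-created, leaving the spa with a fresh godfather zio afterwards.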
*/ if (dump_opt['c']) { for (c = 0; c < max_ncpus; c++) { (void) zio_wait(spa->spa_async_zio_root[c]); spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } } ASSERT0(spa->spa_load_verify_bytes); /* * Done after zio_wait() since zcb_haderrors is modified in * zdb_blkptr_done() */ zcb->zcb_haderrors |= err; if (zcb->zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); for (e = 0; e < 256; e++) { if (zcb->zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb->zcb_errors[e]); } } } /* * Report any leaked segments. */ leaks |= zdb_leak_fini(spa, zcb); tzb = &zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); norm_space = metaslab_class_get_space(spa_normal_class(spa)); total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); total_found = tzb->zb_asize - zcb->zcb_dedup_asize - zcb->zcb_clone_asize + zcb->zcb_removing_size + zcb->zcb_checkpoint_size; if (total_found == total_alloc && !dump_opt['L']) { (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else if (!dump_opt['L']) { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", (longlong_t)(total_alloc - total_found)); } if (tzb->zb_count == 0) { umem_free(zcb, sizeof (zdb_cb_t)); return (2); } (void) printf("\n"); (void) printf("\t%-16s %14llu\n", "bp count:", (u_longlong_t)tzb->zb_count); (void) printf("\t%-16s %14llu\n", "ganged count:", (longlong_t)tzb->zb_gangs); (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp physical:", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", "bp allocated:", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", "bp deduped:", (u_longlong_t)zcb->zcb_dedup_asize, (u_longlong_t)zcb->zcb_dedup_blocks, (double)zcb->zcb_dedup_asize / tzb->zb_asize + 1.0); (void) printf("\t%-16s %14llu count: %6llu\n", "bp cloned:", (u_longlong_t)zcb->zcb_clone_asize, (u_longlong_t)zcb->zcb_clone_blocks); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); if (spa_special_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_class(spa)); uint64_t space = metaslab_class_get_space( spa_special_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Special class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_dedup_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_dedup_class(spa)); uint64_t space = metaslab_class_get_space( spa_dedup_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Dedup class", (u_longlong_t)alloc, 100.0 * alloc / space); } if 
(spa_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_embedded_log_class(spa)); uint64_t space = metaslab_class_get_space( spa_embedded_log_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Embedded log class", (u_longlong_t)alloc, 100.0 * alloc / space); } if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor != NULL) { uint64_t alloc = metaslab_class_get_alloc( spa_special_embedded_log_class(spa)); uint64_t space = metaslab_class_get_space( spa_special_embedded_log_class(spa)); (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Special embedded log", (u_longlong_t)alloc, 100.0 * alloc / space); } for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb->zcb_embedded_blocks[i] == 0) continue; (void) printf("\n"); (void) printf("\tadditional, non-pointer bps of type %u: " "%10llu\n", i, (u_longlong_t)zcb->zcb_embedded_blocks[i]); if (dump_opt['b'] >= 3) { (void) printf("\t number of (compressed) bytes: " "number of bps\n"); dump_histogram(zcb->zcb_embedded_histogram[i], sizeof (zcb->zcb_embedded_histogram[i]) / sizeof (zcb->zcb_embedded_histogram[i][0]), 0); } } if (tzb->zb_ditto_samevdev != 0) { (void) printf("\tDittoed blocks on same vdev: %llu\n", (longlong_t)tzb->zb_ditto_samevdev); } if (tzb->zb_ditto_same_ms != 0) { (void) printf("\tDittoed blocks in same metaslab: %llu\n", (longlong_t)tzb->zb_ditto_same_ms); } for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; v++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; if (vim == NULL) { continue; } char mem[32]; zdb_nicenum(vdev_indirect_mapping_num_entries(vim), mem, vdev_indirect_mapping_size(vim)); (void) printf("\tindirect vdev id %llu has %llu segments " "(%s in memory)\n", (longlong_t)vd->vdev_id, (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); } if (dump_opt['b'] >= 2) { int l, t, level; char csize[32], lsize[32], psize[32], asize[32]; char avg[32], gang[32]; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t), UMEM_NOFAIL); for (t = 0; t <= ZDB_OT_TOTAL; t++) { const char *typename; /* make sure nicenum has enough space */ _Static_assert(sizeof (csize) >= NN_NUMBUF_SZ, "csize truncated"); _Static_assert(sizeof (lsize) >= NN_NUMBUF_SZ, "lsize truncated"); _Static_assert(sizeof (psize) >= NN_NUMBUF_SZ, "psize truncated"); _Static_assert(sizeof (asize) >= NN_NUMBUF_SZ, "asize truncated"); _Static_assert(sizeof (avg) >= NN_NUMBUF_SZ, "avg truncated"); _Static_assert(sizeof (gang) >= NN_NUMBUF_SZ, "gang truncated"); if (t < DMU_OT_NUMTYPES) typename = dmu_ot[t].ot_name; else typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb->zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" "\t%5s\t%5s\t%6s\t%s\n", "-", "-", "-", "-", "-", "-", "-", typename); continue; } for (l = ZB_TOTAL - 1; l >= -1; l--) { level = (l == -1 ? 
ZB_TOTAL : l); zb = &zcb->zcb_type[level][t]; if (zb->zb_asize == 0) continue; if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES && (level > 0 || DMU_OT_IS_METADATA(t))) { mdstats->zb_count += zb->zb_count; mdstats->zb_lsize += zb->zb_lsize; mdstats->zb_psize += zb->zb_psize; mdstats->zb_asize += zb->zb_asize; mdstats->zb_gangs += zb->zb_gangs; } if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; if (level == 0 && zb->zb_asize == zcb->zcb_type[ZB_TOTAL][t].zb_asize) continue; zdb_nicenum(zb->zb_count, csize, sizeof (csize)); zdb_nicenum(zb->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(zb->zb_psize, psize, sizeof (psize)); zdb_nicenum(zb->zb_asize, asize, sizeof (asize)); zdb_nicenum(zb->zb_asize / zb->zb_count, avg, sizeof (avg)); zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)zb->zb_lsize / zb->zb_psize, 100.0 * zb->zb_asize / tzb->zb_asize); if (level == ZB_TOTAL) (void) printf("%s\n", typename); else (void) printf(" L%d %s\n", level, typename); if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { (void) printf("\t number of ganged " "blocks: %s\n", gang); } if (dump_opt['b'] >= 4) { (void) printf("psize " "(in 512-byte sectors): " "number of blocks\n"); dump_histogram(zb->zb_psize_histogram, PSIZE_HISTO_SIZE, 0); } } } zdb_nicenum(mdstats->zb_count, csize, sizeof (csize)); zdb_nicenum(mdstats->zb_lsize, lsize, sizeof (lsize)); zdb_nicenum(mdstats->zb_psize, psize, sizeof (psize)); zdb_nicenum(mdstats->zb_asize, asize, sizeof (asize)); zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg, sizeof (avg)); zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang)); (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" "\t%5.2f\t%6.2f\t", csize, lsize, psize, asize, avg, (double)mdstats->zb_lsize / mdstats->zb_psize, 100.0 * mdstats->zb_asize / tzb->zb_asize); (void) printf("%s\n", "Metadata Total"); /* Output a table summarizing block sizes in the pool */ if (dump_opt['b'] >= 2) { dump_size_histograms(zcb); } umem_free(mdstats, sizeof (zfs_blkstat_t)); } (void) printf("\n"); if (leaks) { umem_free(zcb, sizeof (zdb_cb_t)); return (2); } if (zcb->zcb_haderrors) { umem_free(zcb, sizeof (zdb_cb_t)); return (3); } umem_free(zcb, sizeof (zdb_cb_t)); return (0); } typedef struct zdb_ddt_entry { /* key must be first for ddt_key_compare */ ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; uint64_t zdde_ref_psize; uint64_t zdde_ref_dsize; avl_node_t zdde_node; } zdb_ddt_entry_t; static int zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { (void) zilog, (void) dnp; avl_tree_t *t = arg; avl_index_t where; zdb_ddt_entry_t *zdde, zdde_search; if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { (void) printf("traversing objset %llu, %llu objects, " "%lu blocks so far\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)BP_GET_FILL(bp), avl_numnodes(t)); } if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) return (0); ddt_key_fill(&zdde_search.zdde_key, bp); zdde = avl_find(t, &zdde_search, &where); if (zdde == NULL) { zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); zdde->zdde_key = zdde_search.zdde_key; avl_insert(t, zdde, where); } zdde->zdde_ref_blocks += 1; zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); zdde->zdde_ref_psize += BP_GET_PSIZE(bp); zdde->zdde_ref_dsize += 
bp_get_dsize_sync(spa, bp); return (0); } static void dump_simulated_ddt(spa_t *spa) { avl_tree_t t; void *cookie = NULL; zdb_ddt_entry_t *zdde; ddt_histogram_t ddh_total = {{{0}}}; ddt_stat_t dds_total = {0}; avl_create(&t, ddt_key_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); spa_config_exit(spa, SCL_CONFIG, FTAG); while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { uint64_t refcnt = zdde->zdde_ref_blocks; ASSERT(refcnt != 0); ddt_stat_t *dds = &ddh_total.ddh_stat[highbit64(refcnt) - 1]; dds->dds_blocks += zdde->zdde_ref_blocks / refcnt; dds->dds_lsize += zdde->zdde_ref_lsize / refcnt; dds->dds_psize += zdde->zdde_ref_psize / refcnt; dds->dds_dsize += zdde->zdde_ref_dsize / refcnt; dds->dds_ref_blocks += zdde->zdde_ref_blocks; dds->dds_ref_lsize += zdde->zdde_ref_lsize; dds->dds_ref_psize += zdde->zdde_ref_psize; dds->dds_ref_dsize += zdde->zdde_ref_dsize; umem_free(zdde, sizeof (*zdde)); } avl_destroy(&t); ddt_histogram_total(&dds_total, &ddh_total); (void) printf("Simulated DDT histogram:\n"); zpool_dump_ddt(&dds_total, &ddh_total); dump_dedup_ratio(&dds_total); } static int verify_device_removal_feature_counts(spa_t *spa) { uint64_t dr_feature_refcount = 0; uint64_t oc_feature_refcount = 0; uint64_t indirect_vdev_count = 0; uint64_t precise_vdev_count = 0; uint64_t obsolete_counts_object_count = 0; uint64_t obsolete_sm_count = 0; uint64_t obsolete_counts_count = 0; uint64_t scip_count = 0; uint64_t obsolete_bpobj_count = 0; int ret = 0; spa_condensing_indirect_phys_t *scip = &spa->spa_condensing_indirect_phys; if (scip->scip_next_mapping_object != 0) { vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; ASSERT(scip->scip_prev_obsolete_sm_object != 0); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); (void) printf("Condensing indirect vdev %llu: new mapping " "object %llu, prev obsolete sm %llu\n", (u_longlong_t)scip->scip_vdev, (u_longlong_t)scip->scip_next_mapping_object, (u_longlong_t)scip->scip_prev_obsolete_sm_object); if (scip->scip_prev_obsolete_sm_object != 0) { space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); } scip_count += 2; } for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (vic->vic_mapping_object != 0) { ASSERT(vd->vdev_ops == &vdev_indirect_ops || vd->vdev_removing); indirect_vdev_count++; if (vd->vdev_indirect_mapping->vim_havecounts) { obsolete_counts_count++; } } boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { ASSERT(vic->vic_mapping_object != 0); precise_vdev_count++; } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vic->vic_mapping_object != 0); obsolete_sm_count++; } } (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], &dr_feature_refcount); (void) feature_get_refcount(spa, &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], &oc_feature_refcount); if (dr_feature_refcount != indirect_vdev_count) { ret = 1; (void) printf("Number of indirect vdevs (%llu) " \ 
"does not match feature count (%llu)\n", (u_longlong_t)indirect_vdev_count, (u_longlong_t)dr_feature_refcount); } else { (void) printf("Verified device_removal feature refcount " \ "of %llu is correct\n", (u_longlong_t)dr_feature_refcount); } if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_OBSOLETE_BPOBJ) == 0) { obsolete_bpobj_count++; } obsolete_counts_object_count = precise_vdev_count; obsolete_counts_object_count += obsolete_sm_count; obsolete_counts_object_count += obsolete_counts_count; obsolete_counts_object_count += scip_count; obsolete_counts_object_count += obsolete_bpobj_count; obsolete_counts_object_count += remap_deadlist_count; if (oc_feature_refcount != obsolete_counts_object_count) { ret = 1; (void) printf("Number of obsolete counts objects (%llu) " \ "does not match feature count (%llu)\n", (u_longlong_t)obsolete_counts_object_count, (u_longlong_t)oc_feature_refcount); (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " "ob:%llu rd:%llu\n", (u_longlong_t)precise_vdev_count, (u_longlong_t)obsolete_sm_count, (u_longlong_t)obsolete_counts_count, (u_longlong_t)scip_count, (u_longlong_t)obsolete_bpobj_count, (u_longlong_t)remap_deadlist_count); } else { (void) printf("Verified indirect_refcount feature refcount " \ "of %llu is correct\n", (u_longlong_t)oc_feature_refcount); } return (ret); } static void zdb_set_skip_mmp(char *target) { spa_t *spa; /* * Disable the activity check to allow examination of * active pools. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL) { spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; } mutex_exit(&spa_namespace_lock); } #define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" /* * Import the checkpointed state of the pool specified by the target * parameter as readonly. The function also accepts a pool config * as an optional parameter, else it attempts to infer the config by * the name of the target pool. * * Note that the checkpointed state's pool name will be the name of * the original pool with the above suffix appended to it. In addition, * if the target is not a pool name (e.g. a path to a dataset) then * the new_path parameter is populated with the updated path to * reflect the fact that we are looking into the checkpointed state. * * The function returns a newly-allocated copy of the name of the * pool containing the checkpointed state. When this copy is no * longer needed it should be freed with free(3C). Same thing * applies to the new_path parameter if allocated. 
*/ static char * import_checkpointed_state(char *target, nvlist_t *cfg, boolean_t target_is_spa, char **new_path) { int error = 0; char *poolname, *bogus_name = NULL; boolean_t freecfg = B_FALSE; /* If the target is not a pool, the extract the pool name */ char *path_start = strchr(target, '/'); if (target_is_spa || path_start == NULL) { poolname = target; } else { size_t poolname_len = path_start - target; poolname = strndup(target, poolname_len); } if (cfg == NULL) { zdb_set_skip_mmp(poolname); error = spa_get_stats(poolname, &cfg, NULL, 0); if (error != 0) { fatal("Tried to read config of pool \"%s\" but " "spa_get_stats() failed with error %d\n", poolname, error); } freecfg = B_TRUE; } if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1) { if (target != poolname) free(poolname); return (NULL); } fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name); error = spa_import(bogus_name, cfg, NULL, ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT | ZFS_IMPORT_SKIP_MMP); if (freecfg) nvlist_free(cfg); if (error != 0) { fatal("Tried to import pool \"%s\" but spa_import() failed " "with error %d\n", bogus_name, error); } if (new_path != NULL && !target_is_spa) { if (asprintf(new_path, "%s%s", bogus_name, path_start != NULL ? path_start : "") == -1) { free(bogus_name); if (!target_is_spa && path_start != NULL) free(poolname); return (NULL); } } if (target != poolname) free(poolname); return (bogus_name); } typedef struct verify_checkpoint_sm_entry_cb_arg { vdev_t *vcsec_vd; /* the following fields are only used for printing progress */ uint64_t vcsec_entryid; uint64_t vcsec_num_entries; } verify_checkpoint_sm_entry_cb_arg_t; #define ENTRIES_PER_PROGRESS_UPDATE 10000 static int verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) { verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; vdev_t *vd = vcsec->vcsec_vd; metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; uint64_t end = sme->sme_offset + sme->sme_run; ASSERT(sme->sme_type == SM_FREE); if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { (void) fprintf(stderr, "\rverifying vdev %llu, space map entry %llu of %llu ...", (longlong_t)vd->vdev_id, (longlong_t)vcsec->vcsec_entryid, (longlong_t)vcsec->vcsec_num_entries); } vcsec->vcsec_entryid++; /* * See comment in checkpoint_sm_exclude_entry_cb() */ VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* * The entries in the vdev_checkpoint_sm should be marked as * allocated in the checkpointed state of the pool, therefore * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); zfs_range_tree_verify_not_present(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); } /* * Verify that all segments in the vdev_checkpoint_sm are allocated * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's * ms_allocatable). * * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of * each vdev in the current state of the pool to the metaslab space maps * (ms_sm) of the checkpointed state of the pool. * * Note that the function changes the state of the ms_allocatable * trees of the current spa_t. The entries of these ms_allocatable * trees are cleared out and then repopulated from with the free * entries of their respective ms_sm space maps. 
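 *
 * Note that the vdev_checkpoint_sm iterated below lives in the
 * current pool's MOS, while the ms_allocatable trees it is checked
 * against (loaded with SM_FREE entries) belong to the checkpointed
 * spa_t, so a checkpoint entry that appears free in the checkpointed
 * state fails the zfs_range_tree_verify_not_present() check.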
*/ static void verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; vdev_t *current_vd = current_rvd->vdev_child[c]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * Since we don't allow device removal in a pool * that has a checkpoint, we expect that all removed * vdevs were removed from the pool before the * checkpoint. */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } /* * If the checkpoint space map doesn't exist, then nothing * here is checkpointed so there's nothing to verify. */ if (current_vd->vdev_top_zap == 0 || zap_contains(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(current), current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; vcsec.vcsec_entryid = 0; vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, space_map_length(checkpoint_sm), verify_checkpoint_sm_entry_cb, &vcsec)); if (dump_opt['m'] > 3) dump_spacemap(current->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } /* * If we've added vdevs since we took the checkpoint, ensure * that their checkpoint space maps are empty. */ if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { for (uint64_t c = ckpoint_rvd->vdev_children; c < current_rvd->vdev_children; c++) { vdev_t *current_vd = current_rvd->vdev_child[c]; VERIFY0P(current_vd->vdev_checkpoint_sm); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } /* * Verifies that all space that's allocated in the checkpoint is * still allocated in the current version, by checking that everything * in checkpoint's ms_allocatable (which is actually allocated, not * allocatable/free) is not present in current's ms_allocatable. * * Note that the function changes the state of the ms_allocatable * trees of both spas when called. The entries of all ms_allocatable * trees are cleared out and then repopulated from their respective * ms_sm space maps. In the checkpointed state we load the allocated * entries, and in the current state we load the free entries. 
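 *
 * The walk below then verifies, for every range allocated in the
 * checkpoint, that it is absent from the current state's free tree
 * using zfs_range_tree_verify_not_present(); a hit would mean a
 * checkpointed block has been freed in the meantime.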
*/ static void verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) { vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; vdev_t *current_rvd = current->spa_root_vdev; load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); load_concrete_ms_allocatable_trees(current, SM_FREE); for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; vdev_t *current_vd = current_rvd->vdev_child[i]; if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { /* * See comment in verify_checkpoint_vdev_spacemaps() */ ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); continue; } for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; metaslab_t *current_msp = current_vd->vdev_ms[m]; (void) fprintf(stderr, "\rverifying vdev %llu of %llu, " "metaslab %llu of %llu ...", (longlong_t)current_vd->vdev_id, (longlong_t)current_rvd->vdev_children, (longlong_t)current_vd->vdev_ms[m]->ms_id, (longlong_t)current_vd->vdev_ms_count); /* * We walk through the ms_allocatable trees that * are loaded with the allocated blocks from the * ms_sm spacemaps of the checkpoint. For each * one of these ranges we ensure that none of them * exists in the ms_allocatable trees of the * current state which are loaded with the ranges * that are currently free. * * This way we ensure that none of the blocks that * are part of the checkpoint were freed by mistake. */ zfs_range_tree_walk(ckpoint_msp->ms_allocatable, (zfs_range_tree_func_t *) zfs_range_tree_verify_not_present, current_msp->ms_allocatable); } } /* for cleaner progress output */ (void) fprintf(stderr, "\n"); } static void verify_checkpoint_blocks(spa_t *spa) { ASSERT(!dump_opt['L']); spa_t *checkpoint_spa; char *checkpoint_pool; int error = 0; /* * We import the checkpointed state of the pool (under a different * name) so we can do verification on it against the current state * of the pool. */ checkpoint_pool = import_checkpointed_state(spa->spa_name, NULL, B_TRUE, NULL); ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but spa_open() failed with " "error %d\n", checkpoint_pool, error); } /* * Ensure that ranges in the checkpoint space maps of each vdev * are allocated according to the checkpointed state's metaslab * space maps. */ verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); /* * Ensure that allocated ranges in the checkpoint's metaslab * space maps remain allocated in the metaslab space maps of * the current state. */ verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); /* * Once we are done, we get rid of the checkpointed state. 
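 *
 * Here that just means closing the spa_t that was imported under the
 * BOGUS_SUFFIX name and freeing the pool name string returned by
 * import_checkpointed_state().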
*/ spa_close(checkpoint_spa, FTAG); free(checkpoint_pool); } static void dump_leftover_checkpoint_blocks(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; for (uint64_t i = 0; i < rvd->vdev_children; i++) { vdev_t *vd = rvd->vdev_child[i]; space_map_t *checkpoint_sm = NULL; uint64_t checkpoint_sm_obj; if (vd->vdev_top_zap == 0) continue; if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) continue; VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, &checkpoint_sm_obj)); VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } } static int verify_checkpoint(spa_t *spa) { uberblock_t checkpoint; int error; if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) return (0); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); if (error == ENOENT && !dump_opt['L']) { /* * If the feature is active but the uberblock is missing * then we must be in the middle of discarding the * checkpoint. */ (void) printf("\nPartially discarded checkpoint " "state found:\n"); if (dump_opt['m'] > 3) dump_leftover_checkpoint_blocks(spa); return (0); } else if (error != 0) { (void) printf("lookup error %d when looking for " "checkpointed uberblock in MOS\n", error); return (error); } dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n"); if (checkpoint.ub_checkpoint_txg == 0) { (void) printf("\nub_checkpoint_txg not set in checkpointed " "uberblock\n"); error = 3; } if (error == 0 && !dump_opt['L']) verify_checkpoint_blocks(spa); return (error); } static void mos_leaks_cb(void *arg, uint64_t start, uint64_t size) { (void) arg; for (uint64_t i = start; i < size; i++) { (void) printf("MOS object %llu referenced but not allocated\n", (u_longlong_t)i); } } static void mos_obj_refd(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL) zfs_range_tree_add(mos_refd_objs, obj, 1); } /* * Call on a MOS object that may already have been referenced. 
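 *
 * Unlike mos_obj_refd(), this first checks whether the object is
 * already present in mos_refd_objs so the same object is never added
 * to the range tree twice.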
*/ static void mos_obj_refd_multiple(uint64_t obj) { if (obj != 0 && mos_refd_objs != NULL && !zfs_range_tree_contains(mos_refd_objs, obj, 1)) zfs_range_tree_add(mos_refd_objs, obj, 1); } static void mos_leak_vdev_top_zap(vdev_t *vd) { uint64_t ms_flush_data_obj; int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(ms_flush_data_obj); } static void mos_leak_vdev(vdev_t *vd) { mos_obj_refd(vd->vdev_dtl_object); mos_obj_refd(vd->vdev_ms_array); mos_obj_refd(vd->vdev_indirect_config.vic_births_object); mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); mos_obj_refd(vd->vdev_leaf_zap); if (vd->vdev_checkpoint_sm != NULL) mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); if (vd->vdev_indirect_mapping != NULL) { mos_obj_refd(vd->vdev_indirect_mapping-> vim_phys->vimp_counts_object); } if (vd->vdev_obsolete_sm != NULL) mos_obj_refd(vd->vdev_obsolete_sm->sm_object); for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *ms = vd->vdev_ms[m]; mos_obj_refd(space_map_object(ms->ms_sm)); } if (vd->vdev_root_zap != 0) mos_obj_refd(vd->vdev_root_zap); if (vd->vdev_top_zap != 0) { mos_obj_refd(vd->vdev_top_zap); mos_leak_vdev_top_zap(vd); } for (uint64_t c = 0; c < vd->vdev_children; c++) { mos_leak_vdev(vd->vdev_child[c]); } } static void mos_leak_log_spacemaps(spa_t *spa) { uint64_t spacemap_zap; int error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); if (error == ENOENT) return; ASSERT0(error); mos_obj_refd(spacemap_zap); for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) mos_obj_refd(sls->sls_sm_obj); } static void errorlog_count_refd(objset_t *mos, uint64_t errlog) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, errlog); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { mos_obj_refd(za->za_first_integer); } zap_cursor_fini(&zc); zap_attribute_free(za); } static int dump_mos_leaks(spa_t *spa) { int rv = 0; objset_t *mos = spa->spa_meta_objset; dsl_pool_t *dp = spa->spa_dsl_pool; /* Visit and mark all referenced objects in the MOS */ mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); mos_obj_refd(spa->spa_pool_props_object); mos_obj_refd(spa->spa_config_object); mos_obj_refd(spa->spa_ddt_stat_object); mos_obj_refd(spa->spa_feat_desc_obj); mos_obj_refd(spa->spa_feat_enabled_txg_obj); mos_obj_refd(spa->spa_feat_for_read_obj); mos_obj_refd(spa->spa_feat_for_write_obj); mos_obj_refd(spa->spa_history); mos_obj_refd(spa->spa_errlog_last); mos_obj_refd(spa->spa_errlog_scrub); if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { errorlog_count_refd(mos, spa->spa_errlog_last); errorlog_count_refd(mos, spa->spa_errlog_scrub); } mos_obj_refd(spa->spa_all_vdev_zaps); mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); bpobj_count_refd(&spa->spa_deferred_bpobj); mos_obj_refd(dp->dp_empty_bpobj); bpobj_count_refd(&dp->dp_obsolete_bpobj); bpobj_count_refd(&dp->dp_free_bpobj); mos_obj_refd(spa->spa_l2cache.sav_object); mos_obj_refd(spa->spa_spares.sav_object); if (spa->spa_syncing_log_sm != NULL) mos_obj_refd(spa->spa_syncing_log_sm->sm_object); mos_leak_log_spacemaps(spa); mos_obj_refd(spa->spa_condensing_indirect_phys. 
scip_next_mapping_object); mos_obj_refd(spa->spa_condensing_indirect_phys. scip_prev_obsolete_sm_object); if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(mos, spa->spa_condensing_indirect_phys.scip_next_mapping_object); mos_obj_refd(vim->vim_phys->vimp_counts_object); vdev_indirect_mapping_close(vim); } deleted_livelists_dump_mos(spa); if (dp->dp_origin_snap != NULL) { dsl_dataset_t *ds; dsl_pool_config_enter(dp, FTAG); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, FTAG, &ds)); count_ds_mos_objects(ds); dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); count_ds_mos_objects(dp->dp_origin_snap); dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); } count_dir_mos_objects(dp->dp_mos_dir); if (dp->dp_free_dir != NULL) count_dir_mos_objects(dp->dp_free_dir); if (dp->dp_leak_dir != NULL) count_dir_mos_objects(dp->dp_leak_dir); mos_leak_vdev(spa->spa_root_vdev); for (uint64_t c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (!ddt || ddt->ddt_version == DDT_VERSION_UNCONFIGURED) continue; /* DDT store objects */ for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { mos_obj_refd(ddt->ddt_object[type][class]); } } /* FDT container */ if (ddt->ddt_version == DDT_VERSION_FDT) mos_obj_refd(ddt->ddt_dir_object); /* FDT log objects */ if (ddt->ddt_flags & DDT_FLAG_LOG) { mos_obj_refd(ddt->ddt_log[0].ddl_object); mos_obj_refd(ddt->ddt_log[1].ddl_object); } } for (uint64_t vdevid = 0; vdevid < spa->spa_brt_nvdevs; vdevid++) { brt_vdev_t *brtvd = spa->spa_brt_vdevs[vdevid]; if (brtvd->bv_initiated) { mos_obj_refd(brtvd->bv_mos_brtvdev); mos_obj_refd(brtvd->bv_mos_entries); } } /* * Visit all allocated objects and make sure they are referenced. 
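 *
 * dmu_object_next() below walks every allocated MOS object: objects
 * that were referenced are removed from mos_refd_objs, any object
 * that is allocated but was never referenced is reported as leaked,
 * and whatever remains in mos_refd_objs afterwards is a reference to
 * an object that was never allocated (reported by mos_leaks_cb()).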
*/ uint64_t object = 0; while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { if (zfs_range_tree_contains(mos_refd_objs, object, 1)) { zfs_range_tree_remove(mos_refd_objs, object, 1); } else { dmu_object_info_t doi; const char *name; VERIFY0(dmu_object_info(mos, object, &doi)); if (doi.doi_type & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(doi.doi_type); name = dmu_ot_byteswap[bswap].ob_name; } else { name = dmu_ot[doi.doi_type].ot_name; } (void) printf("MOS object %llu (%s) leaked\n", (u_longlong_t)object, name); rv = 2; } } (void) zfs_range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); if (!zfs_range_tree_is_empty(mos_refd_objs)) rv = 2; zfs_range_tree_vacate(mos_refd_objs, NULL, NULL); zfs_range_tree_destroy(mos_refd_objs); return (rv); } typedef struct log_sm_obsolete_stats_arg { uint64_t lsos_current_txg; uint64_t lsos_total_entries; uint64_t lsos_valid_entries; uint64_t lsos_sm_entries; uint64_t lsos_valid_sm_entries; } log_sm_obsolete_stats_arg_t; static int log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) { log_sm_obsolete_stats_arg_t *lsos = arg; uint64_t offset = sme->sme_offset; uint64_t vdev_id = sme->sme_vdev; if (lsos->lsos_current_txg == 0) { /* this is the first log */ lsos->lsos_current_txg = txg; } else if (lsos->lsos_current_txg < txg) { /* we just changed log - print stats and reset */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos->lsos_valid_sm_entries, (u_longlong_t)lsos->lsos_sm_entries, (u_longlong_t)lsos->lsos_current_txg); lsos->lsos_valid_sm_entries = 0; lsos->lsos_sm_entries = 0; lsos->lsos_current_txg = txg; } ASSERT3U(lsos->lsos_current_txg, ==, txg); lsos->lsos_sm_entries++; lsos->lsos_total_entries++; vdev_t *vd = vdev_lookup_top(spa, vdev_id); if (!vdev_is_concrete(vd)) return (0); metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); if (txg < metaslab_unflushed_txg(ms)) return (0); lsos->lsos_valid_sm_entries++; lsos->lsos_valid_entries++; return (0); } static void dump_log_spacemap_obsolete_stats(spa_t *spa) { if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) return; log_sm_obsolete_stats_arg_t lsos = {0}; (void) printf("Log Space Map Obsolete Entry Statistics:\n"); iterate_through_spacemap_logs(spa, log_spacemap_obsolete_stats_cb, &lsos); /* print stats for latest log */ (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", (u_longlong_t)lsos.lsos_valid_sm_entries, (u_longlong_t)lsos.lsos_sm_entries, (u_longlong_t)lsos.lsos_current_txg); (void) printf("%-8llu valid entries out of %-8llu - total\n\n", (u_longlong_t)lsos.lsos_valid_entries, (u_longlong_t)lsos.lsos_total_entries); } static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; if (dump_opt['y']) { livelist_metaslab_validate(spa); } if (dump_opt['S']) { dump_simulated_ddt(spa); return; } if (!dump_opt['e'] && dump_opt['C'] > 1) { (void) printf("\nCached configuration:\n"); dump_nvlist(spa->spa_config, 8); } if (dump_opt['C']) dump_config(spa); if (dump_opt['u']) dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); if (dump_opt['D']) dump_all_ddts(spa); if (dump_opt['T']) dump_brt(spa); if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); if (dump_opt['M']) dump_metaslab_groups(spa, dump_opt['M'] > 1); if (dump_opt['d'] > 2 || dump_opt['m']) { dump_log_spacemaps(spa); dump_log_spacemap_obsolete_stats(spa); } if (dump_opt['d'] || dump_opt['i']) { spa_feature_t f; 
mos_refd_objs = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, 0, "dump_zpool:mos_refd_objs"); dump_objset(dp->dp_meta_objset); if (dump_opt['d'] >= 3) { dsl_pool_t *dp = spa->spa_dsl_pool; dump_full_bpobj(&spa->spa_deferred_bpobj, "Deferred frees", 0); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { dump_full_bpobj(&dp->dp_free_bpobj, "Pool snapshot frees", 0); } if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)); dump_full_bpobj(&dp->dp_obsolete_bpobj, "Pool obsolete blocks", 0); } if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { dump_bptree(spa->spa_meta_objset, dp->dp_bptree_obj, "Pool dataset frees"); } dump_dtl(spa->spa_root_vdev, 0); } for (spa_feature_t f = 0; f < SPA_FEATURES; f++) global_feature_count[f] = UINT64_MAX; global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; global_feature_count[SPA_FEATURE_REDACTION_LIST_SPILL] = 0; global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; global_feature_count[SPA_FEATURE_LIVELIST] = 0; (void) dmu_objset_find(spa_name(spa), dump_one_objset, NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); if (rc == 0 && !dump_opt['L']) rc = dump_mos_leaks(spa); for (f = 0; f < SPA_FEATURES; f++) { uint64_t refcount; uint64_t *arr; if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { if (global_feature_count[f] == UINT64_MAX) continue; if (!spa_feature_is_enabled(spa, f)) { ASSERT0(global_feature_count[f]); continue; } arr = global_feature_count; } else { if (!spa_feature_is_enabled(spa, f)) { ASSERT0(dataset_feature_count[f]); continue; } arr = dataset_feature_count; } if (feature_get_refcount(spa, &spa_feature_table[f], &refcount) == ENOTSUP) continue; if (arr[f] != refcount) { (void) printf("%s feature refcount mismatch: " "%lld consumers != %lld refcount\n", spa_feature_table[f].fi_uname, (longlong_t)arr[f], (longlong_t)refcount); rc = 2; } else { (void) printf("Verified %s feature refcount " "of %llu is correct\n", spa_feature_table[f].fi_uname, (longlong_t)refcount); } } if (rc == 0) rc = verify_device_removal_feature_counts(spa); } if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) rc = dump_block_stats(spa); if (rc == 0) rc = verify_spacemap_refcounts(spa); if (dump_opt['s']) show_pool_stats(spa); if (dump_opt['h']) dump_history(spa); if (rc == 0) rc = verify_checkpoint(spa); if (rc != 0) { dump_debug_buffer(); zdb_exit(rc); } } #define ZDB_FLAG_CHECKSUM 0x0001 #define ZDB_FLAG_DECOMPRESS 0x0002 #define ZDB_FLAG_BSWAP 0x0004 #define ZDB_FLAG_GBH 0x0008 #define ZDB_FLAG_INDIRECT 0x0010 #define ZDB_FLAG_RAW 0x0020 #define ZDB_FLAG_PRINT_BLKPTR 0x0040 #define ZDB_FLAG_VERBOSE 0x0080 static int flagbits[256]; static char flagbitstr[16]; static void zdb_print_blkptr(const blkptr_t *bp, int flags) { char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); (void) printf("%s\n", blkbuf); } static void zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) { int i; for (i = 0; i < nbps; i++) zdb_print_blkptr(&bp[i], flags); } static void zdb_dump_gbh(void *buf, uint64_t size, int flags) { zdb_dump_indirect((blkptr_t *)buf, gbh_nblkptrs(size), flags); } static void zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); VERIFY(write(fileno(stdout), buf, size) == size); } static void zdb_dump_block(char *label, void *buf, uint64_t size, int flags) { uint64_t *d = (uint64_t *)buf; unsigned nwords 
= size / sizeof (uint64_t); int do_bswap = !!(flags & ZDB_FLAG_BSWAP); unsigned i, j; const char *hdr; char *c; if (do_bswap) hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; else hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); #ifdef _ZFS_LITTLE_ENDIAN /* correct the endianness */ do_bswap = !do_bswap; #endif for (i = 0; i < nwords; i += 2) { (void) printf("%06llx: %016llx %016llx ", (u_longlong_t)(i * sizeof (uint64_t)), (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); c = (char *)&d[i]; for (j = 0; j < 2 * sizeof (uint64_t); j++) (void) printf("%c", isprint(c[j]) ? c[j] : '.'); (void) printf("\n"); } } /* * There are two acceptable formats: * leaf_name - For example: c1t0d0 or /tmp/ztest.0a * child[.child]* - For example: 0.1.1 * * The second form can be used to specify arbitrary vdevs anywhere * in the hierarchy. For example, in a pool with a mirror of * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . */ static vdev_t * zdb_vdev_lookup(vdev_t *vdev, const char *path) { char *s, *p, *q; unsigned i; if (vdev == NULL) return (NULL); /* First, assume the x.x.x.x format */ i = strtoul(path, &s, 10); if (s == path || (s && *s != '.' && *s != '\0')) goto name; if (i >= vdev->vdev_children) return (NULL); vdev = vdev->vdev_child[i]; if (s && *s == '\0') return (vdev); return (zdb_vdev_lookup(vdev, s+1)); name: for (i = 0; i < vdev->vdev_children; i++) { vdev_t *vc = vdev->vdev_child[i]; if (vc->vdev_path == NULL) { vc = zdb_vdev_lookup(vc, path); if (vc == NULL) continue; else return (vc); } p = strrchr(vc->vdev_path, '/'); p = p ? p + 1 : vc->vdev_path; q = &vc->vdev_path[strlen(vc->vdev_path) - 2]; if (strcmp(vc->vdev_path, path) == 0) return (vc); if (strcmp(p, path) == 0) return (vc); if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0) return (vc); } return (NULL); } static int name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr) { dsl_dataset_t *ds; dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id, NULL, &ds); if (error != 0) { (void) fprintf(stderr, "failed to hold objset %llu: %s\n", (u_longlong_t)objset_id, strerror(error)); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); return (error); } dsl_dataset_name(ds, outstr); dsl_dataset_rele(ds, NULL); dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); return (0); } static boolean_t zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize) { char *s0, *s1, *tmp = NULL; if (sizes == NULL) return (B_FALSE); s0 = strtok_r(sizes, "/", &tmp); if (s0 == NULL) return (B_FALSE); s1 = strtok_r(NULL, "/", &tmp); *lsize = strtoull(s0, NULL, 16); *psize = s1 ? strtoull(s1, NULL, 16) : *lsize; return (*lsize >= *psize && *psize > 0); } #define ZIO_COMPRESS_MASK(alg) (1ULL << (ZIO_COMPRESS_##alg)) static boolean_t try_decompress_block(abd_t *pabd, uint64_t lsize, uint64_t psize, int flags, int cfunc, void *lbuf, void *lbuf2) { if (flags & ZDB_FLAG_VERBOSE) { (void) fprintf(stderr, "Trying %05llx -> %05llx (%s)\n", (u_longlong_t)psize, (u_longlong_t)lsize, zio_compress_table[cfunc].ci_name); } /* * We set lbuf to all zeros and lbuf2 to all * ones, then decompress to both buffers and * compare their contents. This way we can * know if decompression filled exactly to * lsize or if it left some bytes unwritten. 
*/ memset(lbuf, 0x00, lsize); memset(lbuf2, 0xff, lsize); abd_t labd, labd2; abd_get_from_buf_struct(&labd, lbuf, lsize); abd_get_from_buf_struct(&labd2, lbuf2, lsize); boolean_t ret = B_FALSE; if (zio_decompress_data(cfunc, pabd, &labd, psize, lsize, NULL) == 0 && zio_decompress_data(cfunc, pabd, &labd2, psize, lsize, NULL) == 0 && memcmp(lbuf, lbuf2, lsize) == 0) ret = B_TRUE; abd_free(&labd2); abd_free(&labd); return (ret); } static uint64_t zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, uint64_t psize, int flags) { (void) buf; uint64_t orig_lsize = lsize; boolean_t tryzle = ((getenv("ZDB_NO_ZLE") == NULL)); /* * We don't know how the data was compressed, so just try * every decompress function at every inflated blocksize. */ void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 }; int *cfuncp = cfuncs; uint64_t maxlsize = SPA_MAXBLOCKSIZE; uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) | ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) | ZIO_COMPRESS_MASK(ZLE); *cfuncp++ = ZIO_COMPRESS_LZ4; *cfuncp++ = ZIO_COMPRESS_LZJB; mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); /* * Every gzip level has the same decompressor, no need to * run it 9 times per bruteforce attempt. */ mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3); mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5); mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7); mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9); for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) if (((1ULL << c) & mask) == 0) *cfuncp++ = c; /* * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this * could take a while and we should let the user know * we are not stuck. On the other hand, printing progress * info gets old after a while. User can specify 'v' flag * to see the progression. */ if (lsize == psize) lsize += SPA_MINBLOCKSIZE; else maxlsize = lsize; for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { for (cfuncp = cfuncs; *cfuncp; cfuncp++) { if (try_decompress_block(pabd, lsize, psize, flags, *cfuncp, lbuf, lbuf2)) { tryzle = B_FALSE; break; } } if (*cfuncp != 0) break; } if (tryzle) { for (lsize = orig_lsize; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) { if (try_decompress_block(pabd, lsize, psize, flags, ZIO_COMPRESS_ZLE, lbuf, lbuf2)) { *cfuncp = ZIO_COMPRESS_ZLE; break; } } } umem_free(lbuf2, SPA_MAXBLOCKSIZE); if (*cfuncp == ZIO_COMPRESS_ZLE) { printf("\nZLE decompression was selected. If you " "suspect the results are wrong,\ntry avoiding ZLE " "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); } return (lsize > maxlsize ? -1 : lsize); } /* * Read a block from a pool and print it out. 
The syntax of the * block descriptor is: * * pool:vdev_specifier:offset:[lsize/]psize[:flags] * * pool - The name of the pool you wish to read from * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) * offset - offset, in hex, in bytes * size - Amount of data to read, in hex, in bytes * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * c: Calculate and display checksums * d: Decompress data before dumping * e: Byteswap data before dumping * g: Display data as a gang block header * i: Display as an indirect block * r: Dump raw data to stdout * v: Verbose * */ static void zdb_read_block(char *thing, spa_t *spa) { blkptr_t blk, *bp = &blk; dva_t *dva = bp->blk_dva; int flags = 0; uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; abd_t *pabd; void *lbuf, *buf; char *s, *p, *dup, *flagstr, *sizes, *tmp = NULL; const char *vdev, *errmsg = NULL; int i, len, error; boolean_t borrowed = B_FALSE, found = B_FALSE; dup = strdup(thing); s = strtok_r(dup, ":", &tmp); vdev = s ?: ""; s = strtok_r(NULL, ":", &tmp); offset = strtoull(s ? s : "", NULL, 16); sizes = strtok_r(NULL, ":", &tmp); s = strtok_r(NULL, ":", &tmp); flagstr = strdup(s ?: ""); if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) errmsg = "invalid size(s)"; if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) errmsg = "size must be a multiple of sector size"; if (!IS_P2ALIGNED(offset, DEV_BSIZE)) errmsg = "offset must be a multiple of sector size"; if (errmsg) { (void) printf("Invalid block specifier: %s - %s\n", thing, errmsg); goto done; } tmp = NULL; for (s = strtok_r(flagstr, ":", &tmp); s != NULL; s = strtok_r(NULL, ":", &tmp)) { len = strlen(flagstr); for (i = 0; i < len; i++) { int bit = flagbits[(uchar_t)flagstr[i]]; if (bit == 0) { (void) printf("***Ignoring flag: %c\n", (uchar_t)flagstr[i]); continue; } found = B_TRUE; flags |= bit; p = &flagstr[i + 1]; if (*p != ':' && *p != '\0') { int j = 0, nextbit = flagbits[(uchar_t)*p]; char *end, offstr[8] = { 0 }; if ((bit == ZDB_FLAG_PRINT_BLKPTR) && (nextbit == 0)) { /* look ahead to isolate the offset */ while (nextbit == 0 && strchr(flagbitstr, *p) == NULL) { offstr[j] = *p; j++; if (i + j > strlen(flagstr)) break; p++; nextbit = flagbits[(uchar_t)*p]; } blkptr_offset = strtoull(offstr, &end, 16); i += j; } else if (nextbit == 0) { (void) printf("***Ignoring flag arg:" " '%c'\n", (uchar_t)*p); } } } } if (blkptr_offset % sizeof (blkptr_t)) { printf("Block pointer offset 0x%llx " "must be divisible by 0x%x\n", (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); goto done; } if (found == B_FALSE && strlen(flagstr) > 0) { printf("Invalid flag arg: '%s'\n", flagstr); goto done; } vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); goto done; } else { if (vd->vdev_path) (void) fprintf(stderr, "Found vdev: %s\n", vd->vdev_path); else (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); BP_ZERO(bp); DVA_SET_VDEV(&dva[0], vd->vdev_id); DVA_SET_OFFSET(&dva[0], offset); DVA_SET_GANG(&dva[0], 0); DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(bp, lsize); BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(bp, DMU_OT_NONE); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 0); 
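	/*
	 * Note on the blkptr being assembled here: it is synthetic, naming
	 * only the single DVA requested on the command line, with checksum
	 * and compression set to OFF, so the zio_read()/zio_vdev_child_io()
	 * issued below travels the normal ZIO pipeline without applying any
	 * transform or verification against on-disk metadata.
	 */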
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); if (vd == vd->vdev_top) { /* * Treat this as a normal block read. */ zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); } else { /* * Treat this as a vdev child I/O. */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); if (error) { (void) printf("Read of %s failed, error: %d\n", thing, error); goto out; } uint64_t orig_lsize = lsize; buf = lbuf; if (flags & ZDB_FLAG_DECOMPRESS) { lsize = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); if (lsize == -1) { (void) printf("Decompress of %s failed\n", thing); goto out; } } else { buf = abd_borrow_buf_copy(pabd, lsize); borrowed = B_TRUE; } /* * Try to detect invalid block pointer. If invalid, try * decompressing. */ if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && !(flags & ZDB_FLAG_DECOMPRESS)) { const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (zfs_blkptr_verify(spa, b, BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY)) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; lsize = zdb_decompress_block(pabd, buf, lbuf, lsize, psize, flags); b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); if (lsize == -1 || zfs_blkptr_verify(spa, b, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { printf("invalid block pointer at this DVA\n"); goto out; } } } if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); else if (flags & ZDB_FLAG_RAW) zdb_dump_block_raw(buf, lsize, flags); else if (flags & ZDB_FLAG_INDIRECT) zdb_dump_indirect((blkptr_t *)buf, orig_lsize / sizeof (blkptr_t), flags); else if (flags & ZDB_FLAG_GBH) zdb_dump_gbh(buf, lsize, flags); else zdb_dump_block(thing, buf, lsize, flags); /* * If :c was specified, iterate through the checksum table to * calculate and display each checksum for our specified * DVA and length. 
*/ if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && !(flags & ZDB_FLAG_GBH)) { zio_t *czio; (void) printf("\n"); for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { if ((zio_checksum_table[ck].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) || ck == ZIO_CHECKSUM_NOPARITY) { continue; } BP_SET_CHECKSUM(bp, ck); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); if (vd == vd->vdev_top) { zio_nowait(zio_read(czio, spa, bp, pabd, psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_DONT_RETRY, NULL)); } else { zio_nowait(zio_vdev_child_io(czio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_OPTIONAL, NULL, NULL)); } error = zio_wait(czio); if (error == 0 || error == ECKSUM) { zio_t *ck_zio = zio_null(NULL, spa, NULL, NULL, NULL, 0); ck_zio->io_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); ck_zio->io_bp = bp; zio_checksum_compute(ck_zio, ck, pabd, psize); printf( "%12s\t" "cksum=%016llx:%016llx:%016llx:%016llx\n", zio_checksum_table[ck].ci_name, (u_longlong_t)bp->blk_cksum.zc_word[0], (u_longlong_t)bp->blk_cksum.zc_word[1], (u_longlong_t)bp->blk_cksum.zc_word[2], (u_longlong_t)bp->blk_cksum.zc_word[3]); zio_wait(ck_zio); } else { printf("error %d reading block\n", error); } spa_config_exit(spa, SCL_STATE, FTAG); } } if (borrowed) abd_return_buf_copy(pabd, buf, lsize); out: abd_free(pabd); umem_free(lbuf, SPA_MAXBLOCKSIZE); done: free(flagstr); free(dup); } static void zdb_embedded_block(char *thing) { blkptr_t bp = {{{{0}}}}; unsigned long long *words = (void *)&bp; char *buf; int err; err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", words + 0, words + 1, words + 2, words + 3, words + 4, words + 5, words + 6, words + 7, words + 8, words + 9, words + 10, words + 11, words + 12, words + 13, words + 14, words + 15); if (err != 16) { (void) fprintf(stderr, "invalid input format\n"); zdb_exit(1); } ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); buf = malloc(SPA_MAXBLOCKSIZE); if (buf == NULL) { (void) fprintf(stderr, "out of memory\n"); zdb_exit(1); } err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); if (err != 0) { (void) fprintf(stderr, "decode failed: %u\n", err); zdb_exit(1); } zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); free(buf); } /* check for valid hex or decimal numeric string */ static boolean_t zdb_numeric(char *str) { int i = 0, len; len = strlen(str); if (len == 0) return (B_FALSE); if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) i = 2; for (; i < len; i++) { if (!isxdigit(str[i])) return (B_FALSE); } return (B_TRUE); } static int dummy_get_file_info(dmu_object_type_t bonustype, const void *data, zfs_file_info_t *zoi) { (void) data, (void) zoi; if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (ENOENT); (void) fprintf(stderr, "dummy_get_file_info: not implemented"); abort(); } int main(int argc, char **argv) { int c; int dump_all = 1; int verbose = 0; int error = 0; char **searchdirs = NULL; int nsearch = 0; char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN]; nvlist_t *policy = NULL; uint64_t max_txg = UINT64_MAX; int64_t objset_id = -1; uint64_t object; int flags = ZFS_IMPORT_MISSING_LOG; int rewind = ZPOOL_NEVER_REWIND; char *spa_config_path_env, *objset_str; boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; nvlist_t *cfg 
= NULL; struct sigaction action; boolean_t force_import = B_FALSE; boolean_t config_path_console = B_FALSE; char pbuf[MAXPATHLEN]; dprintf_setup(&argc, argv); /* * Set up signal handlers, so if we crash due to bad on-disk data we * can get more info. Unlike ztest, we don't bail out if we can't set * up signal handlers, because zdb is very useful without them. */ action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; if (sigaction(SIGSEGV, &action, NULL) < 0) { (void) fprintf(stderr, "zdb: cannot catch SIGSEGV: %s\n", strerror(errno)); } if (sigaction(SIGABRT, &action, NULL) < 0) { (void) fprintf(stderr, "zdb: cannot catch SIGABRT: %s\n", strerror(errno)); } /* * If there is an environment variable SPA_CONFIG_PATH it overrides * default spa_config_path setting. If -U flag is specified it will * override this environment variable settings once again. */ spa_config_path_env = getenv("SPA_CONFIG_PATH"); if (spa_config_path_env != NULL) spa_config_path = spa_config_path_env; /* * For performance reasons, we set this tunable down. We do so before * the arg parsing section so that the user can override this value if * they choose. */ zfs_btree_verify_intensity = 3; struct option long_options[] = { {"ignore-assertions", no_argument, NULL, 'A'}, {"block-stats", no_argument, NULL, 'b'}, {"backup", no_argument, NULL, 'B'}, {"checksum", no_argument, NULL, 'c'}, {"config", no_argument, NULL, 'C'}, {"datasets", no_argument, NULL, 'd'}, {"dedup-stats", no_argument, NULL, 'D'}, {"exported", no_argument, NULL, 'e'}, {"embedded-block-pointer", no_argument, NULL, 'E'}, {"automatic-rewind", no_argument, NULL, 'F'}, {"dump-debug-msg", no_argument, NULL, 'G'}, {"history", no_argument, NULL, 'h'}, {"intent-logs", no_argument, NULL, 'i'}, {"inflight", required_argument, NULL, 'I'}, {"checkpointed-state", no_argument, NULL, 'k'}, {"key", required_argument, NULL, 'K'}, {"label", no_argument, NULL, 'l'}, {"disable-leak-tracking", no_argument, NULL, 'L'}, {"metaslabs", no_argument, NULL, 'm'}, {"metaslab-groups", no_argument, NULL, 'M'}, {"numeric", no_argument, NULL, 'N'}, {"option", required_argument, NULL, 'o'}, {"object-lookups", no_argument, NULL, 'O'}, {"path", required_argument, NULL, 'p'}, {"parseable", no_argument, NULL, 'P'}, {"skip-label", no_argument, NULL, 'q'}, {"copy-object", no_argument, NULL, 'r'}, {"read-block", no_argument, NULL, 'R'}, {"io-stats", no_argument, NULL, 's'}, {"simulate-dedup", no_argument, NULL, 'S'}, {"txg", required_argument, NULL, 't'}, {"brt-stats", no_argument, NULL, 'T'}, {"uberblock", no_argument, NULL, 'u'}, {"cachefile", required_argument, NULL, 'U'}, {"verbose", no_argument, NULL, 'v'}, {"verbatim", no_argument, NULL, 'V'}, {"dump-blocks", required_argument, NULL, 'x'}, {"extreme-rewind", no_argument, NULL, 'X'}, {"all-reconstruction", no_argument, NULL, 'Y'}, {"livelist", no_argument, NULL, 'y'}, {"zstd-headers", no_argument, NULL, 'Z'}, {0, 0, 0, 0} }; while ((c = getopt_long(argc, argv, "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:TuU:vVx:XYyZ", long_options, NULL)) != -1) { switch (c) { case 'b': case 'B': case 'c': case 'C': case 'd': case 'D': case 'E': case 'G': case 'h': case 'i': case 'l': case 'm': case 'M': case 'N': case 'O': case 'r': case 'R': case 's': case 'S': case 'T': case 'u': case 'y': case 'Z': dump_opt[c]++; dump_all = 0; break; case 'A': case 'e': case 'F': case 'k': case 'L': case 'P': case 'q': case 'X': dump_opt[c]++; break; case 'Y': zfs_reconstruct_indirect_combinations_max = INT_MAX; zfs_deadman_enabled = 0; break; /* NB: 
Sort single match options below. */ case 'I': max_inflight_bytes = strtoull(optarg, NULL, 0); if (max_inflight_bytes == 0) { (void) fprintf(stderr, "maximum number " "of inflight bytes must be greater " "than 0\n"); usage(); } break; case 'K': dump_opt[c]++; key_material = strdup(optarg); /* redact key material in process table */ while (*optarg != '\0') { *optarg++ = '*'; } break; case 'o': dump_opt[c]++; dump_all = 0; error = handle_tunable_option(optarg, B_FALSE); if (error != 0) zdb_exit(1); break; case 'p': if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); } else { char **tmp = umem_alloc((nsearch + 1) * sizeof (char *), UMEM_NOFAIL); memcpy(tmp, searchdirs, nsearch * sizeof (char *)); umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = tmp; } searchdirs[nsearch++] = optarg; break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { (void) fprintf(stderr, "incorrect txg " "specified: %s\n", optarg); usage(); } break; case 'U': config_path_console = B_TRUE; spa_config_path = optarg; if (spa_config_path[0] != '/') { (void) fprintf(stderr, "cachefile must be an absolute path " "(i.e. start with a slash)\n"); usage(); } break; case 'v': verbose++; break; case 'V': flags = ZFS_IMPORT_VERBATIM; break; case 'x': vn_dumpdir = optarg; break; default: usage(); break; } } if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } #if defined(_LP64) /* * ZDB does not typically re-read blocks; therefore limit the ARC * to 256 MB, which can be used entirely for metadata. */ zfs_arc_min = 2ULL << SPA_MAXBLOCKSHIFT; zfs_arc_max = 256 * 1024 * 1024; #endif /* * "zdb -c" uses checksum-verifying scrub i/os which are async reads. * "zdb -b" uses traversal prefetch which uses async reads. * For good performance, let several of them be active at once. */ zfs_vdev_async_read_max_active = 10; /* * Disable reference tracking for better performance. */ reference_tracking_enable = B_FALSE; /* * Do not fail spa_load when spa_load_verify fails. This is needed * to load non-idle pools. */ spa_load_verify_dryrun = B_TRUE; /* * ZDB should have ability to read spacemaps. 
*/ spa_mode_readable_spacemaps = B_TRUE; if (dump_all) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } libspl_set_assert_ok((dump_opt['A'] == 1) || (dump_opt['A'] > 2)); zfs_recover = (dump_opt['A'] > 1); argc -= optind; argv += optind; if (argc < 2 && dump_opt['R']) usage(); target = argv[0]; /* * Automate cachefile */ if (!spa_config_path_env && !config_path_console && target && libzfs_core_init() == 0) { char *pname = strdup(target); const char *value; nvlist_t *pnvl = NULL; nvlist_t *vnvl = NULL; if (strpbrk(pname, "/@") != NULL) *strpbrk(pname, "/@") = '\0'; if (pname && lzc_get_props(pname, &pnvl) == 0) { if (nvlist_lookup_nvlist(pnvl, "cachefile", &vnvl) == 0) { value = fnvlist_lookup_string(vnvl, ZPROP_VALUE); } else { value = "-"; } strlcpy(pbuf, value, sizeof (pbuf)); if (pbuf[0] != '\0') { if (pbuf[0] == '/') { if (access(pbuf, F_OK) == 0) spa_config_path = pbuf; else force_import = B_TRUE; } else if ((strcmp(pbuf, "-") == 0 && access(ZPOOL_CACHE, F_OK) != 0) || strcmp(pbuf, "none") == 0) { force_import = B_TRUE; } } nvlist_free(vnvl); } free(pname); nvlist_free(pnvl); libzfs_core_fini(); } dmu_objset_register_type(DMU_OST_ZFS, dummy_get_file_info); kernel_init(SPA_MODE_READ); kernel_init_done = B_TRUE; if (dump_opt['E']) { if (argc != 1) usage(); zdb_embedded_block(argv[0]); error = 0; goto fini; } if (argc < 1) { if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); error = 0; goto fini; } if (dump_opt['o']) /* * Avoid blasting tunable options off the top of the * screen. */ zdb_exit(1); usage(); } if (dump_opt['l']) { error = dump_label(argv[0]); goto fini; } if (dump_opt['X'] || dump_opt['F']) rewind = ZPOOL_DO_REWIND | (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); /* -N implies -d */ if (dump_opt['N'] && dump_opt['d'] == 0) dump_opt['d'] = dump_opt['N']; if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) fatal("internal error: %s", strerror(ENOMEM)); error = 0; if (strpbrk(target, "/@") != NULL) { size_t targetlen; target_pool = strdup(target); *strpbrk(target_pool, "/@") = '\0'; target_is_spa = B_FALSE; targetlen = strlen(target); if (targetlen && target[targetlen - 1] == '/') target[targetlen - 1] = '\0'; /* * See if an objset ID was supplied (-d /). * To disambiguate tank/100, consider the 100 as objsetID * if -N was given, otherwise 100 is an objsetID iff * tank/100 as a named dataset fails on lookup. 
*/ objset_str = strchr(target, '/'); if (objset_str && strlen(objset_str) > 1 && zdb_numeric(objset_str + 1)) { char *endptr; errno = 0; objset_str++; objset_id = strtoull(objset_str, &endptr, 0); /* dataset 0 is the same as opening the pool */ if (errno == 0 && endptr != objset_str && objset_id != 0) { if (dump_opt['N']) dataset_lookup = B_TRUE; } /* normal dataset name not an objset ID */ if (endptr == objset_str) { objset_id = -1; } } else if (objset_str && !zdb_numeric(objset_str + 1) && dump_opt['N']) { printf("Supply a numeric objset ID with -N\n"); error = 2; goto fini; } } else { target_pool = target; } if (dump_opt['e'] || force_import) { importargs_t args = { 0 }; /* * If path is not provided, search in /dev */ if (searchdirs == NULL) { searchdirs = umem_alloc(sizeof (char *), UMEM_NOFAIL); searchdirs[nsearch++] = (char *)ZFS_DEVDIR; } args.paths = nsearch; args.path = searchdirs; args.can_be_active = B_TRUE; libpc_handle_t lpch = { .lpc_lib_handle = NULL, .lpc_ops = &libzpool_config_ops, .lpc_printerr = B_TRUE }; error = zpool_find_config(&lpch, target_pool, &cfg, &args); if (error == 0) { if (nvlist_add_nvlist(cfg, ZPOOL_LOAD_POLICY, policy) != 0) { fatal("can't open '%s': %s", target, strerror(ENOMEM)); } if (dump_opt['C'] > 1) { (void) printf("\nConfiguration for import:\n"); dump_nvlist(cfg, 8); } /* * Disable the activity check to allow examination of * active pools. */ error = spa_import(target_pool, cfg, NULL, flags | ZFS_IMPORT_SKIP_MMP); } } if (searchdirs != NULL) { umem_free(searchdirs, nsearch * sizeof (char *)); searchdirs = NULL; } /* * We need to make sure to process -O option or call * dump_path after the -e option has been processed, * which imports the pool to the namespace if it's * not in the cachefile. */ if (dump_opt['O']) { if (argc != 2) usage(); dump_opt['v'] = verbose + 3; error = dump_path(argv[0], argv[1], NULL); goto fini; } if (dump_opt['r']) { target_is_spa = B_FALSE; if (argc != 3) usage(); dump_opt['v'] = verbose; error = dump_path(argv[0], argv[1], &object); if (error != 0) fatal("internal error: %s", strerror(error)); } /* * import_checkpointed_state makes the assumption that the * target pool that we pass it is already part of the spa * namespace. Because of that we need to make sure to call * it always after the -e option has been processed, which * imports the pool to the namespace if it's not in the * cachefile. */ char *checkpoint_pool = NULL; char *checkpoint_target = NULL; if (dump_opt['k']) { checkpoint_pool = import_checkpointed_state(target, cfg, target_is_spa, &checkpoint_target); if (checkpoint_target != NULL) target = checkpoint_target; } if (cfg != NULL) { nvlist_free(cfg); cfg = NULL; } if (target_pool != target) free(target_pool); if (error == 0) { if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { ASSERT(checkpoint_pool != NULL); ASSERT0P(checkpoint_target); error = spa_open(checkpoint_pool, &spa, FTAG); if (error != 0) { fatal("Tried to open pool \"%s\" but " "spa_open() failed with error %d\n", checkpoint_pool, error); } } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] || objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); if (error) { /* * If we're missing the log device then * try opening the pool after clearing the * log state. 
*/ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(target)) != NULL && spa->spa_log_state == SPA_LOG_MISSING) { spa->spa_log_state = SPA_LOG_CLEAR; error = 0; } mutex_exit(&spa_namespace_lock); if (!error) { error = spa_open_rewind(target, &spa, FTAG, policy, NULL); } } } else if (strpbrk(target, "#") != NULL) { dsl_pool_t *dp; error = dsl_pool_hold(target, FTAG, &dp); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } error = dump_bookmark(dp, target, B_TRUE, verbose > 1); dsl_pool_rele(dp, FTAG); if (error != 0) { fatal("can't dump '%s': %s", target, strerror(error)); } goto fini; } else { target_pool = strdup(target); if (strpbrk(target, "/@") != NULL) *strpbrk(target_pool, "/@") = '\0'; zdb_set_skip_mmp(target); /* * If -N was supplied, the user has indicated that * zdb -d / is in effect. Otherwise * we first assume that the dataset string is the * dataset name. If dmu_objset_hold fails with the * dataset string, and we have an objset_id, retry the * lookup with the objsetID. */ boolean_t retry = B_TRUE; retry_lookup: if (dataset_lookup == B_TRUE) { /* * Use the supplied id to get the name * for open_objset. */ error = spa_open(target_pool, &spa, FTAG); if (error == 0) { error = name_from_objset_id(spa, objset_id, dsname); spa_close(spa, FTAG); if (error == 0) target = dsname; } } if (error == 0) { if (objset_id > 0 && retry) { int err = dmu_objset_hold(target, FTAG, &os); if (err) { dataset_lookup = B_TRUE; retry = B_FALSE; goto retry_lookup; } else { dmu_objset_rele(os, FTAG); } } error = open_objset(target, FTAG, &os); } if (error == 0) spa = dmu_objset_spa(os); free(target_pool); } } nvlist_free(policy); if (error) fatal("can't open '%s': %s", target, strerror(error)); /* * Set the pool failure mode to panic in order to prevent the pool * from suspending. A suspended I/O will have no way to resume and * can prevent the zdb(8) command from terminating as expected. */ if (spa != NULL) spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; argv++; argc--; if (dump_opt['r']) { error = zdb_copy_object(os, object, argv[1]); } else if (!dump_opt['R']) { flagbits['d'] = ZOR_FLAG_DIRECTORY; flagbits['f'] = ZOR_FLAG_PLAIN_FILE; flagbits['m'] = ZOR_FLAG_SPACE_MAP; flagbits['z'] = ZOR_FLAG_ZAP; flagbits['A'] = ZOR_FLAG_ALL_TYPES; if (argc > 0 && dump_opt['d']) { zopt_object_args = argc; zopt_object_ranges = calloc(zopt_object_args, sizeof (zopt_object_range_t)); for (unsigned i = 0; i < zopt_object_args; i++) { int err; const char *msg = NULL; err = parse_object_range(argv[i], &zopt_object_ranges[i], &msg); if (err != 0) fatal("Bad object or range: '%s': %s\n", argv[i], msg ?: ""); } } else if (argc > 0 && dump_opt['m']) { zopt_metaslab_args = argc; zopt_metaslab = calloc(zopt_metaslab_args, sizeof (uint64_t)); for (unsigned i = 0; i < zopt_metaslab_args; i++) { errno = 0; zopt_metaslab[i] = strtoull(argv[i], NULL, 0); if (zopt_metaslab[i] == 0 && errno != 0) fatal("bad number %s: %s", argv[i], strerror(errno)); } } if (dump_opt['B']) { dump_backup(target, objset_id, argc > 0 ? 
argv[0] : NULL); } else if (os != NULL) { dump_objset(os); } else if (zopt_object_args > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); } else { dump_zpool(spa); } } else { flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; flagbits['c'] = ZDB_FLAG_CHECKSUM; flagbits['d'] = ZDB_FLAG_DECOMPRESS; flagbits['e'] = ZDB_FLAG_BSWAP; flagbits['g'] = ZDB_FLAG_GBH; flagbits['i'] = ZDB_FLAG_INDIRECT; flagbits['r'] = ZDB_FLAG_RAW; flagbits['v'] = ZDB_FLAG_VERBOSE; for (int i = 0; i < argc; i++) zdb_read_block(argv[i], spa); } if (dump_opt['k']) { free(checkpoint_pool); if (!target_is_spa) free(checkpoint_target); } fini: if (spa != NULL) zdb_ddt_cleanup(spa); if (os != NULL) { close_objset(os, FTAG); } else if (spa != NULL) { spa_close(spa, FTAG); } fuid_table_destroy(); dump_debug_buffer(); if (kernel_init_done) kernel_fini(); if (corruption_found && error == 0) error = 3; return (error); } diff --git a/sys/contrib/openzfs/cmd/zhack.c b/sys/contrib/openzfs/cmd/zhack.c index 2bd3051dce7b..536532a6762d 100644 --- a/sys/contrib/openzfs/cmd/zhack.c +++ b/sys/contrib/openzfs/cmd/zhack.c @@ -1,1028 +1,1032 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ /* * zhack is a debugging tool that can write changes to ZFS pool using libzpool * for testing purposes. Altering pools with zhack is unsupported and may * result in corrupted pools. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static importargs_t g_importargs; static char *g_pool; static boolean_t g_readonly; typedef enum { ZHACK_REPAIR_OP_UNKNOWN = 0, ZHACK_REPAIR_OP_CKSUM = (1 << 0), ZHACK_REPAIR_OP_UNDETACH = (1 << 1) } zhack_repair_op_t; static __attribute__((noreturn)) void usage(void) { (void) fprintf(stderr, "Usage: zhack [-c cachefile] [-d dir] ...\n" "where is one of the following:\n" "\n"); (void) fprintf(stderr, " feature stat \n" " print information about enabled features\n" " feature enable [-r] [-d desc] \n" " add a new enabled feature to the pool\n" " -d sets the feature's description\n" " -r set read-only compatible flag for feature\n" " feature ref [-md] \n" " change the refcount on the given feature\n" " -d decrease instead of increase the refcount\n" " -m add the feature to the label if increasing refcount\n" "\n" " : should be a feature guid\n" "\n" " label repair \n" " repair labels of a specified device according to options\n" " which may be combined to do their functions in one call\n" " -c repair corrupted label checksums\n" " -u restore the label on a detached device\n" "\n" " : path to vdev\n"); exit(1); } static __attribute__((format(printf, 3, 4))) __attribute__((noreturn)) void fatal(spa_t *spa, const void *tag, const char *fmt, ...) { va_list ap; if (spa != NULL) { spa_close(spa, tag); (void) spa_export(g_pool, NULL, B_TRUE, B_FALSE); } va_start(ap, fmt); (void) fputs("zhack: ", stderr); (void) vfprintf(stderr, fmt, ap); va_end(ap); (void) fputc('\n', stderr); exit(1); } static int space_delta_cb(dmu_object_type_t bonustype, const void *data, zfs_file_info_t *zoi) { (void) data, (void) zoi; /* * Is it a valid type of object to track? */ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (ENOENT); (void) fprintf(stderr, "modifying object that needs user accounting"); abort(); } /* * Target is the dataset whose pool we want to open. */ static void zhack_import(char *target, boolean_t readonly) { nvlist_t *config; nvlist_t *props; int error; kernel_init(readonly ? SPA_MODE_READ : (SPA_MODE_READ | SPA_MODE_WRITE)); dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb); g_readonly = readonly; g_importargs.can_be_active = readonly; g_pool = strdup(target); libpc_handle_t lpch = { .lpc_lib_handle = NULL, .lpc_ops = &libzpool_config_ops, .lpc_printerr = B_TRUE }; error = zpool_find_config(&lpch, target, &config, &g_importargs); if (error) fatal(NULL, FTAG, "cannot import '%s'", target); props = NULL; if (readonly) { VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); VERIFY0(nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), 1)); } zfeature_checks_disable = B_TRUE; error = spa_import(target, config, props, (readonly ? 
ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL)); fnvlist_free(config); zfeature_checks_disable = B_FALSE; if (error == EEXIST) error = 0; if (error) fatal(NULL, FTAG, "can't import '%s': %s", target, strerror(error)); } static void zhack_spa_open(char *target, boolean_t readonly, const void *tag, spa_t **spa) { int err; zhack_import(target, readonly); zfeature_checks_disable = B_TRUE; err = spa_open(target, spa, tag); zfeature_checks_disable = B_FALSE; if (err != 0) fatal(*spa, FTAG, "cannot open '%s': %s", target, strerror(err)); if (spa_version(*spa) < SPA_VERSION_FEATURES) { fatal(*spa, FTAG, "'%s' has version %d, features not enabled", target, (int)spa_version(*spa)); } } static void dump_obj(objset_t *os, uint64_t obj, const char *name) { zap_cursor_t zc; zap_attribute_t *za = zap_attribute_long_alloc(); (void) printf("%s_obj:\n", name); for (zap_cursor_init(&zc, os, obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { if (za->za_integer_length == 8) { ASSERT(za->za_num_integers == 1); (void) printf("\t%s = %llu\n", za->za_name, (u_longlong_t)za->za_first_integer); } else { ASSERT(za->za_integer_length == 1); char val[1024]; VERIFY0(zap_lookup(os, obj, za->za_name, 1, sizeof (val), val)); (void) printf("\t%s = %s\n", za->za_name, val); } } zap_cursor_fini(&zc); zap_attribute_free(za); } static void dump_mos(spa_t *spa) { nvlist_t *nv = spa->spa_label_features; nvpair_t *pair; (void) printf("label config:\n"); for (pair = nvlist_next_nvpair(nv, NULL); pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { (void) printf("\t%s\n", nvpair_name(pair)); } } static void zhack_do_feature_stat(int argc, char **argv) { spa_t *spa; objset_t *os; char *target; argc--; argv++; if (argc < 1) { (void) fprintf(stderr, "error: missing pool name\n"); usage(); } target = argv[0]; zhack_spa_open(target, B_TRUE, FTAG, &spa); os = spa->spa_meta_objset; dump_obj(os, spa->spa_feat_for_read_obj, "for_read"); dump_obj(os, spa->spa_feat_for_write_obj, "for_write"); dump_obj(os, spa->spa_feat_desc_obj, "descriptions"); if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg"); } dump_mos(spa); spa_close(spa, FTAG); } static void zhack_feature_enable_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; zfeature_info_t *feature = arg; feature_enable_sync(spa, feature, tx); spa_history_log_internal(spa, "zhack enable feature", tx, "name=%s flags=%u", feature->fi_guid, feature->fi_flags); } static void zhack_do_feature_enable(int argc, char **argv) { int c; char *desc, *target; spa_t *spa; objset_t *mos; zfeature_info_t feature; const spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; /* * Features are not added to the pool's label until their refcounts * are incremented, so fi_mos can just be left as false for now. 
*/ desc = NULL; feature.fi_uname = "zhack"; feature.fi_flags = 0; feature.fi_depends = nodeps; feature.fi_feature = SPA_FEATURE_NONE; optind = 1; while ((c = getopt(argc, argv, "+rd:")) != -1) { switch (c) { case 'r': feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT; break; case 'd': if (desc != NULL) free(desc); desc = strdup(optarg); break; default: usage(); break; } } if (desc == NULL) desc = strdup("zhack injected"); feature.fi_desc = desc; argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, "error: missing feature or pool name\n"); usage(); } target = argv[0]; feature.fi_guid = argv[1]; if (!zfeature_is_valid_guid(feature.fi_guid)) fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); zhack_spa_open(target, B_FALSE, FTAG, &spa); mos = spa->spa_meta_objset; if (zfeature_is_supported(feature.fi_guid)) fatal(spa, FTAG, "'%s' is a real feature, will not enable", feature.fi_guid); if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid)) fatal(spa, FTAG, "feature already enabled: %s", feature.fi_guid); VERIFY0(dsl_sync_task(spa_name(spa), NULL, zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL)); spa_close(spa, FTAG); free(desc); } static void feature_incr_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; zfeature_info_t *feature = arg; uint64_t refcount; + mutex_enter(&spa->spa_feat_stats_lock); VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); feature_sync(spa, feature, refcount + 1, tx); spa_history_log_internal(spa, "zhack feature incr", tx, "name=%s", feature->fi_guid); + mutex_exit(&spa->spa_feat_stats_lock); } static void feature_decr_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; zfeature_info_t *feature = arg; uint64_t refcount; + mutex_enter(&spa->spa_feat_stats_lock); VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount)); feature_sync(spa, feature, refcount - 1, tx); spa_history_log_internal(spa, "zhack feature decr", tx, "name=%s", feature->fi_guid); + mutex_exit(&spa->spa_feat_stats_lock); } static void zhack_do_feature_ref(int argc, char **argv) { int c; char *target; boolean_t decr = B_FALSE; spa_t *spa; objset_t *mos; zfeature_info_t feature; const spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; /* * fi_desc does not matter here because it was written to disk * when the feature was enabled, but we need to properly set the * feature for read or write based on the information we read off * disk later. 
*/ feature.fi_uname = "zhack"; feature.fi_flags = 0; feature.fi_desc = NULL; feature.fi_depends = nodeps; feature.fi_feature = SPA_FEATURE_NONE; optind = 1; while ((c = getopt(argc, argv, "+md")) != -1) { switch (c) { case 'm': feature.fi_flags |= ZFEATURE_FLAG_MOS; break; case 'd': decr = B_TRUE; break; default: usage(); break; } } argc -= optind; argv += optind; if (argc < 2) { (void) fprintf(stderr, "error: missing feature or pool name\n"); usage(); } target = argv[0]; feature.fi_guid = argv[1]; if (!zfeature_is_valid_guid(feature.fi_guid)) fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid); zhack_spa_open(target, B_FALSE, FTAG, &spa); mos = spa->spa_meta_objset; if (zfeature_is_supported(feature.fi_guid)) { fatal(spa, FTAG, "'%s' is a real feature, will not change refcount", feature.fi_guid); } if (0 == zap_contains(mos, spa->spa_feat_for_read_obj, feature.fi_guid)) { feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT; } else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj, feature.fi_guid)) { feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT; } else { fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid); } if (decr) { uint64_t count; if (feature_get_refcount_from_disk(spa, &feature, &count) == 0 && count == 0) { fatal(spa, FTAG, "feature refcount already 0: %s", feature.fi_guid); } } VERIFY0(dsl_sync_task(spa_name(spa), NULL, decr ? feature_decr_sync : feature_incr_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL)); spa_close(spa, FTAG); } static int zhack_do_feature(int argc, char **argv) { char *subcommand; argc--; argv++; if (argc == 0) { (void) fprintf(stderr, "error: no feature operation specified\n"); usage(); } subcommand = argv[0]; if (strcmp(subcommand, "stat") == 0) { zhack_do_feature_stat(argc, argv); } else if (strcmp(subcommand, "enable") == 0) { zhack_do_feature_enable(argc, argv); } else if (strcmp(subcommand, "ref") == 0) { zhack_do_feature_ref(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); usage(); } return (0); } #define ASHIFT_UBERBLOCK_SHIFT(ashift) \ MIN(MAX(ashift, UBERBLOCK_SHIFT), \ MAX_UBERBLOCK_SHIFT) #define ASHIFT_UBERBLOCK_SIZE(ashift) \ (1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift)) #define REPAIR_LABEL_STATUS_CKSUM (1 << 0) #define REPAIR_LABEL_STATUS_UB (1 << 1) static int zhack_repair_read_label(const int fd, vdev_label_t *vl, const uint64_t label_offset, const int l) { const int err = pread64(fd, vl, sizeof (vdev_label_t), label_offset); if (err == -1) { (void) fprintf(stderr, "error: cannot read label %d: %s\n", l, strerror(errno)); return (err); } else if (err != sizeof (vdev_label_t)) { (void) fprintf(stderr, "error: bad label %d read size\n", l); return (err); } return (0); } static void zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum) { zio_cksum_t verifier; zio_cksum_t current_cksum; zio_checksum_info_t *ci; abd_t *abd; ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); current_cksum = eck->zec_cksum; eck->zec_cksum = verifier; ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; abd = abd_get_from_buf(data, abdsize); ci->ci_func[byteswap](abd, abdsize, NULL, cksum); abd_free(abd); eck->zec_cksum = current_cksum; } static int zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg, uint64_t *ashift) { int err; if (ub->ub_txg != 0) { (void) 
fprintf(stderr, "error: label %d: UB TXG of 0 expected, but got %" PRIu64 "\n", l, ub->ub_txg); (void) fprintf(stderr, "It would appear the device was not " "properly removed.\n"); return (1); } for (int i = 0; i < cfg_keys_len; i++) { uint64_t val; err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); if (err) { (void) fprintf(stderr, "error: label %d, %d: " "cannot find nvlist key %s\n", l, i, cfg_keys[i]); return (err); } } err = nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg); if (err) { (void) fprintf(stderr, "error: label %d: cannot find nvlist key %s\n", l, ZPOOL_CONFIG_VDEV_TREE); return (err); } err = nvlist_lookup_uint64(vdev_tree_cfg, ZPOOL_CONFIG_ASHIFT, ashift); if (err) { (void) fprintf(stderr, "error: label %d: cannot find nvlist key %s\n", l, ZPOOL_CONFIG_ASHIFT); return (err); } if (*ashift == 0) { (void) fprintf(stderr, "error: label %d: nvlist key %s is zero\n", l, ZPOOL_CONFIG_ASHIFT); return (err); } return (0); } static int zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l) { /* * Uberblock root block pointer has valid birth TXG. * Copying it to the label NVlist */ if (BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp) != 0) { const uint64_t txg = BP_GET_LOGICAL_BIRTH(&ub->ub_rootbp); ub->ub_txg = txg; if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { (void) fprintf(stderr, "error: label %d: " "Failed to remove pool creation TXG\n", l); return (1); } if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) { (void) fprintf(stderr, "error: label %d: Failed to remove pool TXG to " "be replaced.\n", l); return (1); } if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) { (void) fprintf(stderr, "error: label %d: " "Failed to add pool TXG of %" PRIu64 "\n", l, txg); return (1); } } return (0); } static boolean_t zhack_repair_write_label(const int l, const int fd, const int byteswap, void *data, zio_eck_t *eck, const uint64_t offset, const uint64_t abdsize) { zio_cksum_t actual_cksum; zhack_repair_calc_cksum(byteswap, data, offset, abdsize, eck, &actual_cksum); zio_cksum_t expected_cksum = eck->zec_cksum; ssize_t err; if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (B_FALSE); eck->zec_cksum = actual_cksum; err = pwrite64(fd, data, abdsize, offset); if (err == -1) { (void) fprintf(stderr, "error: cannot write label %d: %s\n", l, strerror(errno)); return (B_FALSE); } else if (err != abdsize) { (void) fprintf(stderr, "error: bad write size label %d\n", l); return (B_FALSE); } else { (void) fprintf(stderr, "label %d: wrote %" PRIu64 " bytes at offset %" PRIu64 "\n", l, abdsize, offset); } return (B_TRUE); } static void zhack_repair_write_uberblock(vdev_label_t *vl, const int l, const uint64_t ashift, const int fd, const int byteswap, const uint64_t label_offset, uint32_t *labels_repaired) { void *ub_data = (char *)vl + offsetof(vdev_label_t, vl_uberblock); zio_eck_t *ub_eck = (zio_eck_t *) ((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1; if (ub_eck->zec_magic != 0) { (void) fprintf(stderr, "error: label %d: " "Expected Uberblock checksum magic number to " "be 0, but got %" PRIu64 "\n", l, ub_eck->zec_magic); (void) fprintf(stderr, "It would appear there's already " "a checksum for the uberblock.\n"); return; } ub_eck->zec_magic = byteswap ? 
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; if (zhack_repair_write_label(l, fd, byteswap, ub_data, ub_eck, label_offset + offsetof(vdev_label_t, vl_uberblock), ASHIFT_UBERBLOCK_SIZE(ashift))) labels_repaired[l] |= REPAIR_LABEL_STATUS_UB; } static void zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum) { (void) fprintf(stream, "%016llx:%016llx:%016llx:%016llx", (u_longlong_t)cksum->zc_word[0], (u_longlong_t)cksum->zc_word[1], (u_longlong_t)cksum->zc_word[2], (u_longlong_t)cksum->zc_word[3]); } static int zhack_repair_test_cksum(const int byteswap, void *vdev_data, zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l) { const zio_cksum_t expected_cksum = vdev_eck->zec_cksum; zio_cksum_t actual_cksum; zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset, VDEV_PHYS_SIZE, vdev_eck, &actual_cksum); const uint64_t expected_magic = byteswap ? BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; const uint64_t actual_magic = vdev_eck->zec_magic; int err = 0; if (actual_magic != expected_magic) { (void) fprintf(stderr, "error: label %d: " "Expected " "the nvlist checksum magic number to not be %" PRIu64 " not %" PRIu64 "\n", l, expected_magic, actual_magic); err = ECKSUM; } if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { (void) fprintf(stderr, "error: label %d: " "Expected the nvlist checksum to be ", l); (void) zhack_repair_print_cksum(stderr, &expected_cksum); (void) fprintf(stderr, " not "); zhack_repair_print_cksum(stderr, &actual_cksum); (void) fprintf(stderr, "\n"); err = ECKSUM; } return (err); } static void zhack_repair_one_label(const zhack_repair_op_t op, const int fd, vdev_label_t *vl, const uint64_t label_offset, const int l, uint32_t *labels_repaired) { ssize_t err; uberblock_t *ub = (uberblock_t *)vl->vl_uberblock; void *vdev_data = (char *)vl + offsetof(vdev_label_t, vl_vdev_phys); zio_eck_t *vdev_eck = (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1; const uint64_t vdev_phys_offset = label_offset + offsetof(vdev_label_t, vl_vdev_phys); const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; nvlist_t *cfg; nvlist_t *vdev_tree_cfg = NULL; uint64_t ashift; int byteswap; err = zhack_repair_read_label(fd, vl, label_offset, l); if (err) return; if (vdev_eck->zec_magic == 0) { (void) fprintf(stderr, "error: label %d: " "Expected the nvlist checksum magic number to not be zero" "\n", l); (void) fprintf(stderr, "There should already be a checksum " "for the label.\n"); return; } byteswap = (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)); if (byteswap) { byteswap_uint64_array(&vdev_eck->zec_cksum, sizeof (zio_cksum_t)); vdev_eck->zec_magic = BSWAP_64(vdev_eck->zec_magic); } if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && zhack_repair_test_cksum(byteswap, vdev_data, vdev_eck, vdev_phys_offset, l) != 0) { (void) fprintf(stderr, "It would appear checksums are " "corrupted. 
Try zhack repair label -c \n"); return; } err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); if (err) { (void) fprintf(stderr, "error: cannot unpack nvlist label %d\n", l); return; } err = zhack_repair_check_label(ub, l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift); if (err) return; if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) { char *buf; size_t buflen; err = zhack_repair_undetach(ub, cfg, l); if (err) return; buf = vl->vl_vdev_phys.vp_nvlist; buflen = VDEV_PHYS_SIZE - sizeof (zio_eck_t); if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) { (void) fprintf(stderr, "error: label %d: Failed to pack nvlist\n", l); return; } zhack_repair_write_uberblock(vl, l, ashift, fd, byteswap, label_offset, labels_repaired); } if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck, vdev_phys_offset, VDEV_PHYS_SIZE)) labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; fsync(fd); } static const char * zhack_repair_label_status(const uint32_t label_status, const uint32_t to_check) { return ((label_status & to_check) != 0 ? "repaired" : "skipped"); } static int zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv) { uint32_t labels_repaired[VDEV_LABELS] = {0}; vdev_label_t labels[VDEV_LABELS] = {{{0}}}; struct stat64 st; int fd; off_t filesize; uint32_t repaired = 0; abd_init(); if (argc < 1) { (void) fprintf(stderr, "error: missing device\n"); usage(); } if ((fd = open(argv[0], O_RDWR)) == -1) fatal(NULL, FTAG, "cannot open '%s': %s", argv[0], strerror(errno)); if (fstat64_blk(fd, &st) != 0) fatal(NULL, FTAG, "cannot stat '%s': %s", argv[0], strerror(errno)); filesize = st.st_size; (void) fprintf(stderr, "Calculated filesize to be %jd\n", (intmax_t)filesize); if (filesize % sizeof (vdev_label_t) != 0) filesize = (filesize / sizeof (vdev_label_t)) * sizeof (vdev_label_t); for (int l = 0; l < VDEV_LABELS; l++) { zhack_repair_one_label(op, fd, &labels[l], vdev_label_offset(filesize, l, 0), l, labels_repaired); } close(fd); abd_fini(); for (int l = 0; l < VDEV_LABELS; l++) { const uint32_t lr = labels_repaired[l]; (void) printf("label %d: ", l); (void) printf("uberblock: %s ", zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_UB)); (void) printf("checksum: %s\n", zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_CKSUM)); repaired |= lr; } if (repaired > 0) return (0); return (1); } static int zhack_do_label_repair(int argc, char **argv) { zhack_repair_op_t op = ZHACK_REPAIR_OP_UNKNOWN; int c; optind = 1; while ((c = getopt(argc, argv, "+cu")) != -1) { switch (c) { case 'c': op |= ZHACK_REPAIR_OP_CKSUM; break; case 'u': op |= ZHACK_REPAIR_OP_UNDETACH; break; default: usage(); break; } } argc -= optind; argv += optind; if (op == ZHACK_REPAIR_OP_UNKNOWN) op = ZHACK_REPAIR_OP_CKSUM; return (zhack_label_repair(op, argc, argv)); } static int zhack_do_label(int argc, char **argv) { char *subcommand; int err; argc--; argv++; if (argc == 0) { (void) fprintf(stderr, "error: no label operation specified\n"); usage(); } subcommand = argv[0]; if (strcmp(subcommand, "repair") == 0) { err = zhack_do_label_repair(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); usage(); } return (err); } #define MAX_NUM_PATHS 1024 int main(int argc, char **argv) { char *path[MAX_NUM_PATHS]; const char *subcommand; int rv = 0; int c; g_importargs.path = path; dprintf_setup(&argc, argv); zfs_prop_init(); while ((c = getopt(argc, argv, "+c:d:")) != -1) { switch (c) { case 'c': g_importargs.cachefile = optarg; 
break; case 'd': assert(g_importargs.paths < MAX_NUM_PATHS); g_importargs.path[g_importargs.paths++] = optarg; break; default: usage(); break; } } argc -= optind; argv += optind; optind = 1; if (argc == 0) { (void) fprintf(stderr, "error: no command specified\n"); usage(); } subcommand = argv[0]; if (strcmp(subcommand, "feature") == 0) { rv = zhack_do_feature(argc, argv); } else if (strcmp(subcommand, "label") == 0) { return (zhack_do_label(argc, argv)); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); usage(); } if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) { fatal(NULL, FTAG, "pool export failed; " "changes may not be committed to disk\n"); } kernel_fini(); return (rv); } diff --git a/sys/contrib/openzfs/cmd/zpool/Makefile.am b/sys/contrib/openzfs/cmd/zpool/Makefile.am index 2f962408e5a3..5bb6d8160b18 100644 --- a/sys/contrib/openzfs/cmd/zpool/Makefile.am +++ b/sys/contrib/openzfs/cmd/zpool/Makefile.am @@ -1,204 +1,207 @@ zpool_CFLAGS = $(AM_CFLAGS) zpool_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUUID_CFLAGS) zpool_CPPFLAGS = $(AM_CPPFLAGS) zpool_CPPFLAGS += -I$(srcdir)/%D% sbin_PROGRAMS += zpool CPPCHECKTARGETS += zpool zpool_SOURCES = \ %D%/zpool_iter.c \ %D%/zpool_main.c \ %D%/zpool_util.c \ %D%/zpool_util.h \ %D%/zpool_vdev.c if BUILD_FREEBSD zpool_SOURCES += \ %D%/os/freebsd/zpool_vdev_os.c endif if BUILD_LINUX zpool_SOURCES += \ %D%/os/linux/zpool_vdev_os.c endif zpool_LDADD = \ libzfs.la \ libzfs_core.la \ libnvpair.la \ libuutil.la \ libzutil.la zpool_LDADD += $(LTLIBINTL) if BUILD_FREEBSD zpool_LDADD += -lgeom endif zpool_LDADD += -lm $(LIBBLKID_LIBS) $(LIBUUID_LIBS) dist_noinst_DATA += %D%/zpool.d/README SHELLCHECKSCRIPTS += $(dist_zpoolexec_SCRIPTS) zpoolexecdir = $(zfsexecdir)/zpool.d dist_zpoolexec_SCRIPTS = \ %D%/zpool.d/ata_err \ %D%/zpool.d/cmd_to \ %D%/zpool.d/defect \ %D%/zpool.d/dm-deps \ %D%/zpool.d/enc \ %D%/zpool.d/encdev \ %D%/zpool.d/fault_led \ %D%/zpool.d/health \ %D%/zpool.d/hours_on \ %D%/zpool.d/iostat \ %D%/zpool.d/iostat-10s \ %D%/zpool.d/iostat-1s \ %D%/zpool.d/label \ %D%/zpool.d/locate_led \ %D%/zpool.d/lsblk \ %D%/zpool.d/media \ %D%/zpool.d/model \ %D%/zpool.d/nonmed \ %D%/zpool.d/nvme_err \ %D%/zpool.d/off_ucor \ %D%/zpool.d/pend_sec \ %D%/zpool.d/pwr_cyc \ %D%/zpool.d/r_proc \ %D%/zpool.d/r_ucor \ %D%/zpool.d/realloc \ %D%/zpool.d/rep_ucor \ %D%/zpool.d/serial \ %D%/zpool.d/ses \ %D%/zpool.d/size \ %D%/zpool.d/slot \ %D%/zpool.d/smart \ %D%/zpool.d/smart_test \ %D%/zpool.d/smartx \ %D%/zpool.d/temp \ %D%/zpool.d/test_ended \ %D%/zpool.d/test_progress \ %D%/zpool.d/test_status \ %D%/zpool.d/test_type \ %D%/zpool.d/upath \ %D%/zpool.d/vendor \ %D%/zpool.d/w_proc \ %D%/zpool.d/w_ucor zpoolconfdefaults = \ dm-deps \ enc \ encdev \ fault_led \ iostat \ iostat-1s \ iostat-10s \ label \ locate_led \ lsblk \ media \ model \ serial \ ses \ size \ slot \ smart \ smartx \ temp \ health \ r_proc \ w_proc \ r_ucor \ w_ucor \ nonmed \ defect \ hours_on \ realloc \ rep_ucor \ cmd_to \ pend_sec \ off_ucor \ ata_err \ nvme_err \ pwr_cyc \ upath \ vendor \ smart_test \ test_type \ test_status \ test_progress \ test_ended zpoolcompatdir = $(pkgdatadir)/compatibility.d dist_zpoolcompat_DATA = \ %D%/compatibility.d/compat-2018 \ %D%/compatibility.d/compat-2019 \ %D%/compatibility.d/compat-2020 \ %D%/compatibility.d/compat-2021 \ %D%/compatibility.d/freebsd-11.0 \ %D%/compatibility.d/freebsd-11.2 \ %D%/compatibility.d/freebsd-11.3 \ %D%/compatibility.d/freenas-9.10.2 \ %D%/compatibility.d/grub2-2.06 \ 
%D%/compatibility.d/grub2-2.12 \ %D%/compatibility.d/openzfs-2.0-freebsd \ %D%/compatibility.d/openzfs-2.0-linux \ %D%/compatibility.d/openzfs-2.1-freebsd \ %D%/compatibility.d/openzfs-2.1-linux \ %D%/compatibility.d/openzfs-2.2 \ %D%/compatibility.d/openzfs-2.3 \ + %D%/compatibility.d/openzfs-2.4 \ %D%/compatibility.d/openzfsonosx-1.7.0 \ %D%/compatibility.d/openzfsonosx-1.8.1 \ %D%/compatibility.d/openzfsonosx-1.9.3 \ %D%/compatibility.d/zol-0.6.1 \ %D%/compatibility.d/zol-0.6.4 \ %D%/compatibility.d/zol-0.6.5 \ %D%/compatibility.d/zol-0.7 \ %D%/compatibility.d/zol-0.8 # canonical <- alias symbolic link pairs # eg: "2018" is a link to "compat-2018" zpoolcompatlinks = \ "compat-2018 2018" \ "compat-2019 2019" \ "compat-2020 2020" \ "compat-2021 2021" \ "freebsd-11.0 freebsd-11.1" \ "freebsd-11.0 freenas-11.0" \ "freebsd-11.2 freenas-11.2" \ "freebsd-11.3 freebsd-11.4" \ "freebsd-11.3 freebsd-12.0" \ "freebsd-11.3 freebsd-12.1" \ "freebsd-11.3 freebsd-12.2" \ "freebsd-11.3 freebsd-12.3" \ "freebsd-11.3 freebsd-12.4" \ "grub2-2.12 grub2" \ "openzfs-2.1-freebsd freebsd-13.0" \ "openzfs-2.1-freebsd freebsd-13.1" \ "openzfs-2.1-freebsd freebsd-13.2" \ "freebsd-11.3 freenas-11.3" \ "freenas-11.0 freenas-11.1" \ "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \ "openzfs-2.0-freebsd truenas-12.0" \ "zol-0.7 ubuntu-18.04" \ "zol-0.8 ubuntu-20.04" \ "openzfs-2.1-linux ubuntu-22.04" \ "openzfs-2.2 openzfs-2.2-linux" \ "openzfs-2.2 openzfs-2.2-freebsd" \ "openzfs-2.3 openzfs-2.3-linux" \ - "openzfs-2.3 openzfs-2.3-freebsd" + "openzfs-2.3 openzfs-2.3-freebsd" \ + "openzfs-2.4 openzfs-2.4-linux" \ + "openzfs-2.4 openzfs-2.4-freebsd" zpoolconfdir = $(sysconfdir)/zfs/zpool.d INSTALL_DATA_HOOKS += zpool-install-data-hook zpool-install-data-hook: $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)" set -x; for f in $(zpoolconfdefaults); do \ [ -f "$(DESTDIR)$(zpoolconfdir)/$${f}" ] || \ [ -L "$(DESTDIR)$(zpoolconfdir)/$${f}" ] || \ $(LN_S) "$(zpoolexecdir)/$${f}" "$(DESTDIR)$(zpoolconfdir)"; \ done set -x; printf '%s\n' $(zpoolcompatlinks) | \ while read -r canon alias; do \ $(LN_S) -f "$${canon}" "$(DESTDIR)$(zpoolcompatdir)/$${alias}"; \ done diff --git a/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 new file mode 100644 index 000000000000..3fbd91014c95 --- /dev/null +++ b/sys/contrib/openzfs/cmd/zpool/compatibility.d/openzfs-2.4 @@ -0,0 +1,48 @@ +# Features supported by OpenZFS 2.4 on Linux and FreeBSD +allocation_classes +async_destroy +blake3 +block_cloning +block_cloning_endian +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +draid +dynamic_gang_header +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +fast_dedup +filesystem_limits +head_errlog +hole_birth +large_blocks +large_dnode +large_microzap +livelist +log_spacemap +longname +lz4_compress +multi_vdev_crash_dump +obsolete_counts +physical_rewrite +project_quota +raidz_expansion +redacted_datasets +redaction_bookmarks +redaction_list_spill +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +vdev_zaps_v2 +zilsaxattr +zpool_checkpoint +zstd_compress diff --git a/sys/contrib/openzfs/cmd/zstream/Makefile.am b/sys/contrib/openzfs/cmd/zstream/Makefile.am index be3539fe905d..80ef1ea7ca11 100644 --- a/sys/contrib/openzfs/cmd/zstream/Makefile.am +++ b/sys/contrib/openzfs/cmd/zstream/Makefile.am @@ -1,23 +1,24 @@ zstream_CPPFLAGS = $(AM_CPPFLAGS) $(LIBZPOOL_CPPFLAGS) sbin_PROGRAMS += zstream CPPCHECKTARGETS += zstream 
zstream_SOURCES = \ %D%/zstream.c \ %D%/zstream.h \ %D%/zstream_decompress.c \ %D%/zstream_dump.c \ %D%/zstream_recompress.c \ %D%/zstream_redup.c \ %D%/zstream_token.c zstream_LDADD = \ libzfs.la \ libzfs_core.la \ libzpool.la \ libnvpair.la -PHONY += install-exec-hook -install-exec-hook: +cmd-zstream-install-exec-hook: cd $(DESTDIR)$(sbindir) && $(LN_S) -f zstream zstreamdump + +INSTALL_EXEC_HOOKS += cmd-zstream-install-exec-hook diff --git a/sys/contrib/openzfs/config/always-arch.m4 b/sys/contrib/openzfs/config/always-arch.m4 index 9f413eeddf95..1ee6099ca8b2 100644 --- a/sys/contrib/openzfs/config/always-arch.m4 +++ b/sys/contrib/openzfs/config/always-arch.m4 @@ -1,41 +1,58 @@ dnl # dnl # Set the target cpu architecture. This allows the dnl # following syntax to be used in a Makefile.am. dnl # dnl # if TARGET_CPU_POWERPC dnl # ... dnl # else dnl # ... dnl # endif dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [ case $target_cpu in i?86) TARGET_CPU=i386 ;; amd64|x86_64) TARGET_CPU=x86_64 ;; powerpc*) TARGET_CPU=powerpc ;; aarch64*) TARGET_CPU=aarch64 ;; armv*) TARGET_CPU=arm ;; sparc64) TARGET_CPU=sparc64 ;; *) TARGET_CPU=$target_cpu ;; esac AM_CONDITIONAL([TARGET_CPU_AARCH64], test $TARGET_CPU = aarch64) AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64) AM_CONDITIONAL([TARGET_CPU_POWERPC], test $TARGET_CPU = powerpc) AM_CONDITIONAL([TARGET_CPU_SPARC64], test $TARGET_CPU = sparc64) AM_CONDITIONAL([TARGET_CPU_ARM], test $TARGET_CPU = arm) ]) +dnl # +dnl # Check for conflicting environment variables +dnl # +dnl # If ARCH env variable is set up, then kernel Makefile in the /usr/src/kernel +dnl # can misbehave during the zfs ./configure test of the module compilation. +AC_DEFUN([ZFS_AC_CONFIG_CHECK_ARCH_VAR], [ + AC_MSG_CHECKING([for conflicting environment variables]) + if test -n "$ARCH"; then + AC_MSG_RESULT([warning]) + AC_MSG_WARN(m4_normalize([ARCH environment variable is set to "$ARCH". + This can cause build kernel modules support check failure. + Please unset it.])) + else + AC_MSG_RESULT([done]) + fi +]) + diff --git a/sys/contrib/openzfs/config/always-compiler-options.m4 b/sys/contrib/openzfs/config/always-compiler-options.m4 index 6383b12506ee..37fa079e0f4c 100644 --- a/sys/contrib/openzfs/config/always-compiler-options.m4 +++ b/sys/contrib/openzfs/config/always-compiler-options.m4 @@ -1,357 +1,379 @@ dnl # dnl # Enabled -fsanitize=address if supported by $CC. dnl # dnl # LDFLAGS needs -fsanitize=address at all times so libraries compiled with dnl # it will be linked successfully. CFLAGS will vary by binary being built. dnl # dnl # The ASAN_OPTIONS environment variable can be used to further control dnl # the behavior of binaries and libraries build with -fsanitize=address. 
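dnl #
dnl # For example (purely illustrative; any real setting depends on what is
dnl # being debugged):
dnl #
dnl #   ASAN_OPTIONS=abort_on_error=1:detect_leaks=0 zpool status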
dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_ASAN], [ AC_MSG_CHECKING([whether to build with -fsanitize=address support]) AC_ARG_ENABLE([asan], [AS_HELP_STRING([--enable-asan], [Enable -fsanitize=address support @<:@default=no@:>@])], [], [enable_asan=no]) AM_CONDITIONAL([ASAN_ENABLED], [test x$enable_asan = xyes]) AC_SUBST([ASAN_ENABLED], [$enable_asan]) AC_MSG_RESULT($enable_asan) AS_IF([ test "$enable_asan" = "yes" ], [ AC_MSG_CHECKING([whether $CC supports -fsanitize=address]) saved_cflags="$CFLAGS" CFLAGS="$CFLAGS -Werror -fsanitize=address" AC_LINK_IFELSE([ AC_LANG_SOURCE([[ int main() { return 0; } ]]) ], [ ASAN_CFLAGS="-fsanitize=address" ASAN_LDFLAGS="-fsanitize=address" ASAN_ZFS="_with_asan" AC_MSG_RESULT([yes]) ], [ AC_MSG_ERROR([$CC does not support -fsanitize=address]) ]) CFLAGS="$saved_cflags" ], [ ASAN_CFLAGS="" ASAN_LDFLAGS="" ASAN_ZFS="_without_asan" ]) AC_SUBST([ASAN_CFLAGS]) AC_SUBST([ASAN_LDFLAGS]) AC_SUBST([ASAN_ZFS]) ]) dnl # dnl # Enabled -fsanitize=undefined if supported by cc. dnl # dnl # LDFLAGS needs -fsanitize=undefined at all times so libraries compiled with dnl # it will be linked successfully. CFLAGS will vary by binary being built. dnl # dnl # The UBSAN_OPTIONS environment variable can be used to further control dnl # the behavior of binaries and libraries build with -fsanitize=undefined. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_UBSAN], [ AC_MSG_CHECKING([whether to build with -fsanitize=undefined support]) AC_ARG_ENABLE([ubsan], [AS_HELP_STRING([--enable-ubsan], [Enable -fsanitize=undefined support @<:@default=no@:>@])], [], [enable_ubsan=no]) AM_CONDITIONAL([UBSAN_ENABLED], [test x$enable_ubsan = xyes]) AC_SUBST([UBSAN_ENABLED], [$enable_ubsan]) AC_MSG_RESULT($enable_ubsan) AS_IF([ test "$enable_ubsan" = "yes" ], [ AC_MSG_CHECKING([whether $CC supports -fsanitize=undefined]) saved_cflags="$CFLAGS" CFLAGS="$CFLAGS -Werror -fsanitize=undefined" AC_LINK_IFELSE([ AC_LANG_SOURCE([[ int main() { return 0; } ]]) ], [ UBSAN_CFLAGS="-fsanitize=undefined" UBSAN_LDFLAGS="-fsanitize=undefined" UBSAN_ZFS="_with_ubsan" AC_MSG_RESULT([yes]) ], [ AC_MSG_ERROR([$CC does not support -fsanitize=undefined]) ]) CFLAGS="$saved_cflags" ], [ UBSAN_CFLAGS="" UBSAN_LDFLAGS="" UBSAN_ZFS="_without_ubsan" ]) AC_SUBST([UBSAN_CFLAGS]) AC_SUBST([UBSAN_LDFLAGS]) AC_SUBST([UBSAN_ZFS]) ]) dnl # dnl # Check if cc supports -Wframe-larger-than= option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN], [ AC_MSG_CHECKING([whether $CC supports -Wframe-larger-than=]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -Wframe-larger-than=4096" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ FRAME_LARGER_THAN="-Wframe-larger-than=4096" AC_MSG_RESULT([yes]) ], [ FRAME_LARGER_THAN="" AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([FRAME_LARGER_THAN]) ]) dnl # dnl # Check if cc supports -Wno-format-truncation option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION], [ AC_MSG_CHECKING([whether $CC supports -Wno-format-truncation]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -Wno-format-truncation" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_FORMAT_TRUNCATION=-Wno-format-truncation AC_MSG_RESULT([yes]) ], [ NO_FORMAT_TRUNCATION= AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([NO_FORMAT_TRUNCATION]) ]) dnl # dnl # Check if cc supports -Wno-format-zero-length option. 
dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH], [ AC_MSG_CHECKING([whether $CC supports -Wno-format-zero-length]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -Wno-format-zero-length" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_FORMAT_ZERO_LENGTH=-Wno-format-zero-length AC_MSG_RESULT([yes]) ], [ NO_FORMAT_ZERO_LENGTH= AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([NO_FORMAT_ZERO_LENGTH]) ]) +dnl # +dnl # Check if kernel cc supports -Wno-format-zero-length option. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_NO_FORMAT_ZERO_LENGTH], [ + saved_cc="$CC" + AS_IF( + [ test -n "$KERNEL_CC" ], [ CC="$KERNEL_CC" ], + [ test -n "$KERNEL_LLVM" ], [ CC="clang" ], + [ CC="gcc" ] + ) + AC_MSG_CHECKING([whether $CC supports -Wno-format-zero-length]) + + saved_flags="$CFLAGS" + CFLAGS="$CFLAGS -Werror -Wno-format-zero-length" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ + KERNEL_NO_FORMAT_ZERO_LENGTH=-Wno-format-zero-length + AC_MSG_RESULT([yes]) + ], [ + KERNEL_NO_FORMAT_ZERO_LENGTH= + AC_MSG_RESULT([no]) + ]) + + CC="$saved_cc" + CFLAGS="$saved_flags" + AC_SUBST([KERNEL_NO_FORMAT_ZERO_LENGTH]) +]) + dnl # dnl # Check if cc supports -Wno-clobbered option. dnl # dnl # We actually invoke it with the -Wclobbered option dnl # and infer the 'no-' version does or doesn't exist based upon dnl # the results. This is required because when checking any of dnl # no- prefixed options gcc always returns success. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_CLOBBERED], [ AC_MSG_CHECKING([whether $CC supports -Wno-clobbered]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -Wclobbered" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_CLOBBERED=-Wno-clobbered AC_MSG_RESULT([yes]) ], [ NO_CLOBBERED= AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([NO_CLOBBERED]) ]) dnl # dnl # Check if cc supports -Wimplicit-fallthrough option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_IMPLICIT_FALLTHROUGH], [ AC_MSG_CHECKING([whether $CC supports -Wimplicit-fallthrough]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -Wimplicit-fallthrough" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ IMPLICIT_FALLTHROUGH=-Wimplicit-fallthrough AC_DEFINE([HAVE_IMPLICIT_FALLTHROUGH], 1, [Define if compiler supports -Wimplicit-fallthrough]) AC_MSG_RESULT([yes]) ], [ IMPLICIT_FALLTHROUGH= AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([IMPLICIT_FALLTHROUGH]) ]) dnl # dnl # Check if cc supports -Winfinite-recursion option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_INFINITE_RECURSION], [ AC_MSG_CHECKING([whether $CC supports -Winfinite-recursion]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -Winfinite-recursion" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ INFINITE_RECURSION=-Winfinite-recursion AC_DEFINE([HAVE_INFINITE_RECURSION], 1, [Define if compiler supports -Winfinite-recursion]) AC_MSG_RESULT([yes]) ], [ INFINITE_RECURSION= AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([INFINITE_RECURSION]) ]) dnl # dnl # Check if kernel cc supports -Winfinite-recursion option. 
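dnl # Like the other ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_* checks in this file,
dnl # the probe below runs against $KERNEL_CC when it is set, against clang
dnl # when KERNEL_LLVM is set, and against gcc otherwise, so the result
dnl # reflects the compiler that will actually build the kernel modules.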
dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_INFINITE_RECURSION], [ - AC_MSG_CHECKING([whether $KERNEL_CC supports -Winfinite-recursion]) - saved_cc="$CC" + AS_IF( + [ test -n "$KERNEL_CC" ], [ CC="$KERNEL_CC" ], + [ test -n "$KERNEL_LLVM" ], [ CC="clang" ], + [ CC="gcc" ] + ) + AC_MSG_CHECKING([whether $CC supports -Winfinite-recursion]) + saved_flags="$CFLAGS" - CC="gcc" CFLAGS="$CFLAGS -Werror -Winfinite-recursion" - AS_IF([ test -n "$KERNEL_CC" ], [ - CC="$KERNEL_CC" - ]) - AS_IF([ test -n "$KERNEL_LLVM" ], [ - CC="clang" - ]) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ KERNEL_INFINITE_RECURSION=-Winfinite-recursion AC_DEFINE([HAVE_KERNEL_INFINITE_RECURSION], 1, [Define if compiler supports -Winfinite-recursion]) AC_MSG_RESULT([yes]) ], [ KERNEL_INFINITE_RECURSION= AC_MSG_RESULT([no]) ]) CC="$saved_cc" CFLAGS="$saved_flags" AC_SUBST([KERNEL_INFINITE_RECURSION]) ]) dnl # dnl # Check if cc supports -Wformat-overflow option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_FORMAT_OVERFLOW], [ AC_MSG_CHECKING([whether $CC supports -Wformat-overflow]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -Wformat-overflow" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ FORMAT_OVERFLOW=-Wformat-overflow AC_DEFINE([HAVE_FORMAT_OVERFLOW], 1, [Define if compiler supports -Wformat-overflow]) AC_MSG_RESULT([yes]) ], [ FORMAT_OVERFLOW= AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([FORMAT_OVERFLOW]) ]) dnl # dnl # Check if cc supports -fno-omit-frame-pointer option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER], [ AC_MSG_CHECKING([whether $CC supports -fno-omit-frame-pointer]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -fno-omit-frame-pointer" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_OMIT_FRAME_POINTER=-fno-omit-frame-pointer AC_MSG_RESULT([yes]) ], [ NO_OMIT_FRAME_POINTER= AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([NO_OMIT_FRAME_POINTER]) ]) dnl # dnl # Check if cc supports -fno-ipa-sra option. dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA], [ AC_MSG_CHECKING([whether $CC supports -fno-ipa-sra]) saved_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror -fno-ipa-sra" AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ NO_IPA_SRA=-fno-ipa-sra AC_MSG_RESULT([yes]) ], [ NO_IPA_SRA= AC_MSG_RESULT([no]) ]) CFLAGS="$saved_flags" AC_SUBST([NO_IPA_SRA]) ]) dnl # dnl # Check if kernel cc supports -fno-ipa-sra option. 
dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_NO_IPA_SRA], [ - AC_MSG_CHECKING([whether $KERNEL_CC supports -fno-ipa-sra]) - saved_cc="$CC" + AS_IF( + [ test -n "$KERNEL_CC" ], [ CC="$KERNEL_CC" ], + [ test -n "$KERNEL_LLVM" ], [ CC="clang" ], + [ CC="gcc" ] + ) + AC_MSG_CHECKING([whether $CC supports -fno-ipa-sra]) + saved_flags="$CFLAGS" - CC="gcc" CFLAGS="$CFLAGS -Werror -fno-ipa-sra" - AS_IF([ test -n "$KERNEL_CC" ], [ - CC="$KERNEL_CC" - ]) - AS_IF([ test -n "$KERNEL_LLVM" ], [ - CC="clang" - ]) - AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ KERNEL_NO_IPA_SRA=-fno-ipa-sra AC_MSG_RESULT([yes]) ], [ KERNEL_NO_IPA_SRA= AC_MSG_RESULT([no]) ]) CC="$saved_cc" CFLAGS="$saved_flags" AC_SUBST([KERNEL_NO_IPA_SRA]) ]) diff --git a/sys/contrib/openzfs/config/kernel-blkdev.m4 b/sys/contrib/openzfs/config/kernel-blkdev.m4 index 83190c6fbe3f..02011bf39fb2 100644 --- a/sys/contrib/openzfs/config/kernel-blkdev.m4 +++ b/sys/contrib/openzfs/config/kernel-blkdev.m4 @@ -1,784 +1,781 @@ dnl # dnl # 2.6.38 API change, dnl # Added blkdev_get_by_path() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ ZFS_LINUX_TEST_SRC([blkdev_get_by_path], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; bdev = blkdev_get_by_path(path, mode, holder); ]) ]) dnl # dnl # 6.5.x API change, dnl # blkdev_get_by_path() takes 4 args dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [ ZFS_LINUX_TEST_SRC([blkdev_get_by_path_4arg], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; - struct blk_holder_ops h; - bdev = blkdev_get_by_path(path, mode, holder, &h); + bdev = blkdev_get_by_path(path, mode, holder, NULL); ]) ]) dnl # dnl # 6.8.x API change dnl # bdev_open_by_path() replaces blkdev_get_by_path() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [ ZFS_LINUX_TEST_SRC([bdev_open_by_path], [ #include #include ], [ struct bdev_handle *bdh __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; - struct blk_holder_ops h; - bdh = bdev_open_by_path(path, mode, holder, &h); + bdh = bdev_open_by_path(path, mode, holder, NULL); ]) ]) dnl # dnl # 6.9.x API change dnl # bdev_file_open_by_path() replaced bdev_open_by_path(), dnl # and returns struct file* dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH], [ ZFS_LINUX_TEST_SRC([bdev_file_open_by_path], [ #include #include ], [ struct file *file __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; - struct blk_holder_ops h; - file = bdev_file_open_by_path(path, mode, holder, &h); + file = bdev_file_open_by_path(path, mode, holder, NULL); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 4 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path_4arg], [ AC_DEFINE(HAVE_BLKDEV_GET_BY_PATH_4ARG, 1, [blkdev_get_by_path() exists and takes 4 args]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether bdev_open_by_path() exists]) ZFS_LINUX_TEST_RESULT([bdev_open_by_path], [ AC_DEFINE(HAVE_BDEV_OPEN_BY_PATH, 1, [bdev_open_by_path() exists]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether bdev_file_open_by_path() exists]) 
ZFS_LINUX_TEST_RESULT([bdev_file_open_by_path], [ AC_DEFINE(HAVE_BDEV_FILE_OPEN_BY_PATH, 1, [bdev_file_open_by_path() exists]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) ]) ]) ]) ]) ]) dnl # dnl # 6.5.x API change dnl # blk_mode_t was added as a type to supercede some places where fmode_t dnl # is used dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T], [ ZFS_LINUX_TEST_SRC([blk_mode_t], [ #include #include ], [ blk_mode_t m __attribute((unused)) = (blk_mode_t)0; ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [ AC_MSG_CHECKING([whether blk_mode_t is defined]) ZFS_LINUX_TEST_RESULT([blk_mode_t], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLK_MODE_T, 1, [blk_mode_t is defined]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 2.6.38 API change, dnl # Added blkdev_put() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ ZFS_LINUX_TEST_SRC([blkdev_put], [ #include #include ], [ struct block_device *bdev = NULL; fmode_t mode = 0; blkdev_put(bdev, mode); ]) ]) dnl # dnl # 6.5.x API change. dnl # blkdev_put() takes (void* holder) as arg 2 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER], [ ZFS_LINUX_TEST_SRC([blkdev_put_holder], [ #include #include ], [ struct block_device *bdev = NULL; void *holder = NULL; blkdev_put(bdev, holder); ]) ]) dnl # dnl # 6.8.x API change dnl # bdev_release() replaces blkdev_put() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [ ZFS_LINUX_TEST_SRC([bdev_release], [ #include #include ], [ struct bdev_handle *bdh = NULL; bdev_release(bdh); ]) ]) dnl # dnl # 6.9.x API change dnl # dnl # bdev_release() now private, but because bdev_file_open_by_path() returns dnl # struct file*, we can just use fput(). So the blkdev_put test no longer dnl # fails if not found. dnl # AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_PUT, 1, [blkdev_put() exists]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) ZFS_LINUX_TEST_RESULT([blkdev_put_holder], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_PUT_HOLDER, 1, [blkdev_put() accepts void* as arg 2]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether bdev_release() exists]) ZFS_LINUX_TEST_RESULT([bdev_release], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_RELEASE, 1, [bdev_release() exists]) ], [ AC_MSG_RESULT(no) ]) ]) ]) ]) dnl # dnl # 4.1 API, exported blkdev_reread_part() symbol, back ported to the dnl # 3.10.0 CentOS 7.x enterprise kernels. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART], [ ZFS_LINUX_TEST_SRC([blkdev_reread_part], [ #include #include ], [ struct block_device *bdev = NULL; int error; error = blkdev_reread_part(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ AC_MSG_CHECKING([whether blkdev_reread_part() exists]) ZFS_LINUX_TEST_RESULT([blkdev_reread_part], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1, [blkdev_reread_part() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # check_disk_change() was removed in 5.10 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE], [ ZFS_LINUX_TEST_SRC([check_disk_change], [ #include #include ], [ struct block_device *bdev = NULL; bool error; error = check_disk_change(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ AC_MSG_CHECKING([whether check_disk_change() exists]) ZFS_LINUX_TEST_RESULT([check_disk_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_CHECK_DISK_CHANGE, 1, [check_disk_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 6.5.x API change dnl # disk_check_media_change() was added dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ ZFS_LINUX_TEST_SRC([disk_check_media_change], [ #include #include ], [ struct block_device *bdev = NULL; bool error; error = disk_check_media_change(bdev->bd_disk); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ AC_MSG_CHECKING([whether disk_check_media_change() exists]) ZFS_LINUX_TEST_RESULT([disk_check_media_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1, [disk_check_media_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # bdev_kobj() is introduced from 5.12 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ], [ ZFS_LINUX_TEST_SRC([bdev_kobj], [ #include #include #include ], [ struct block_device *bdev = NULL; struct kobject *disk_kobj; disk_kobj = bdev_kobj(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ], [ AC_MSG_CHECKING([whether bdev_kobj() exists]) ZFS_LINUX_TEST_RESULT([bdev_kobj], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_KOBJ, 1, [bdev_kobj() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # part_to_dev() was removed in 5.12 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV], [ ZFS_LINUX_TEST_SRC([part_to_dev], [ #include #include ], [ struct hd_struct *p = NULL; struct device *pdev; pdev = part_to_dev(p); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV], [ AC_MSG_CHECKING([whether part_to_dev() exists]) ZFS_LINUX_TEST_RESULT([part_to_dev], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_PART_TO_DEV, 1, [part_to_dev() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.10 API, check_disk_change() is removed, in favor of dnl # bdev_check_media_change(), which doesn't force revalidation dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ ZFS_LINUX_TEST_SRC([bdev_check_media_change], [ #include #include ], [ struct block_device *bdev = NULL; int error; error = bdev_check_media_change(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ AC_MSG_CHECKING([whether bdev_check_media_change() exists]) ZFS_LINUX_TEST_RESULT([bdev_check_media_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_CHECK_MEDIA_CHANGE, 1, [bdev_check_media_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 2.6.22 API change dnl # Single argument invalidate_bdev() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV], [ ZFS_LINUX_TEST_SRC([invalidate_bdev], [ #include #include ],[ struct block_device *bdev = NULL; invalidate_bdev(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV], [ 
AC_MSG_CHECKING([whether invalidate_bdev() exists]) ZFS_LINUX_TEST_RESULT([invalidate_bdev], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([invalidate_bdev()]) ]) ]) dnl # dnl # 5.11 API, lookup_bdev() takes dev_t argument. dnl # 2.6.27 API, lookup_bdev() was first exported. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV], [ ZFS_LINUX_TEST_SRC([lookup_bdev_devt], [ #include ], [ int error __attribute__ ((unused)); const char path[] = "/example/path"; dev_t dev; error = lookup_bdev(path, &dev); ]) ZFS_LINUX_TEST_SRC([lookup_bdev_1arg], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)); const char path[] = "/example/path"; bdev = lookup_bdev(path); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV], [ AC_MSG_CHECKING([whether lookup_bdev() wants dev_t arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_devt], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_DEVT_LOOKUP_BDEV, 1, [lookup_bdev() wants dev_t arg]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_1arg], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, [lookup_bdev() wants 1 arg]) ], [ ZFS_LINUX_TEST_ERROR([lookup_bdev()]) ]) ]) ]) dnl # dnl # 2.6.30 API change dnl # dnl # The bdev_physical_block_size() interface was added to provide a way dnl # to determine the smallest write which can be performed without a dnl # read-modify-write operation. dnl # dnl # Unfortunately, this interface isn't entirely reliable because dnl # drives are sometimes known to misreport this value. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ ZFS_LINUX_TEST_SRC([bdev_physical_block_size], [ #include ],[ struct block_device *bdev __attribute__ ((unused)) = NULL; bdev_physical_block_size(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) ZFS_LINUX_TEST_RESULT([bdev_physical_block_size], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bdev_physical_block_size()]) ]) ]) dnl # dnl # 2.6.30 API change dnl # Added bdev_logical_block_size(). dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ ZFS_LINUX_TEST_SRC([bdev_logical_block_size], [ #include ],[ struct block_device *bdev __attribute__ ((unused)) = NULL; bdev_logical_block_size(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ AC_MSG_CHECKING([whether bdev_logical_block_size() is available]) ZFS_LINUX_TEST_RESULT([bdev_logical_block_size], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bdev_logical_block_size()]) ]) ]) dnl # dnl # 5.11 API change dnl # Added bdev_whole() helper. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE], [ ZFS_LINUX_TEST_SRC([bdev_whole], [ #include ],[ struct block_device *bdev = NULL; bdev = bdev_whole(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [ AC_MSG_CHECKING([whether bdev_whole() is available]) ZFS_LINUX_TEST_RESULT([bdev_whole], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_WHOLE, 1, [bdev_whole() is available]) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.16 API change dnl # Added bdev_nr_bytes() helper. 
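dnl # A minimal sketch of how a consumer might branch on the resulting
dnl # define (illustrative only, not a specific ZFS call site; the
dnl # pre-5.16 fallback shown is an assumption):
dnl #
dnl #   #ifdef HAVE_BDEV_NR_BYTES
dnl #           loff_t size = bdev_nr_bytes(bdev);
dnl #   #else
dnl #           loff_t size = i_size_read(bdev->bd_inode);
dnl #   #endif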
dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_NR_BYTES], [ ZFS_LINUX_TEST_SRC([bdev_nr_bytes], [ #include ],[ struct block_device *bdev = NULL; loff_t nr_bytes __attribute__ ((unused)) = 0; nr_bytes = bdev_nr_bytes(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_NR_BYTES], [ AC_MSG_CHECKING([whether bdev_nr_bytes() is available]) ZFS_LINUX_TEST_RESULT([bdev_nr_bytes], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_NR_BYTES, 1, [bdev_nr_bytes() is available]) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.20 API change, dnl # Removed bdevname(), snprintf(.., %pg) should be used. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME], [ ZFS_LINUX_TEST_SRC([bdevname], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; char path[BDEVNAME_SIZE]; (void) bdevname(bdev, path); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ AC_MSG_CHECKING([whether bdevname() exists]) ZFS_LINUX_TEST_RESULT([bdevname], [ AC_DEFINE(HAVE_BDEVNAME, 1, [bdevname() is available]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # TRIM support: discard and secure erase. We make use of asynchronous dnl # functions when available. dnl # dnl # 3.10: dnl # sync discard: blkdev_issue_discard(..., 0) dnl # sync erase: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # async discard: [not available] dnl # async erase: [not available] dnl # dnl # 4.7: dnl # sync discard: blkdev_issue_discard(..., 0) dnl # sync erase: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # async discard: __blkdev_issue_discard(..., 0) dnl # async erase: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # dnl # 5.19: dnl # sync discard: blkdev_issue_discard(...) dnl # sync erase: blkdev_issue_secure_erase(...) dnl # async discard: __blkdev_issue_discard(...) dnl # async erase: [not available] dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD], [ ZFS_LINUX_TEST_SRC([blkdev_issue_discard_noflags], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; int error __attribute__ ((unused)); error = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL); ]) ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; unsigned long flags = 0; int error __attribute__ ((unused)); error = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags); ]) ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_noflags], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; struct bio *biop = NULL; int error __attribute__ ((unused)); error = __blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, &biop); ]) ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; unsigned long flags = 0; struct bio *biop = NULL; int error __attribute__ ((unused)); error = __blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags, &biop); ]) ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; int error __attribute__ ((unused)); error = blkdev_issue_secure_erase(bdev, sector, nr_sects, GFP_KERNEL); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD], [ AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_noflags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS, 1, [blkdev_issue_discard() is available]) ],[ AC_MSG_RESULT(no) ]) 
AC_MSG_CHECKING([whether blkdev_issue_discard(flags) is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_FLAGS, 1, [blkdev_issue_discard(flags) is available]) ],[ AC_MSG_RESULT(no) ]) AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_noflags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS, 1, [__blkdev_issue_discard() is available]) ],[ AC_MSG_RESULT(no) ]) AC_MSG_CHECKING([whether __blkdev_issue_discard(flags) is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS, 1, [__blkdev_issue_discard(flags) is available]) ],[ AC_MSG_RESULT(no) ]) AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_SECURE_ERASE, 1, [blkdev_issue_secure_erase() is available]) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.13 API change dnl # blkdev_get_by_path() no longer handles ERESTARTSYS dnl # dnl # Unfortunately we're forced to rely solely on the kernel version dnl # number in order to determine the expected behavior. This was an dnl # internal change to blkdev_get_by_dev(), see commit a8ed1a0607. dnl # AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [ AC_MSG_CHECKING([whether blkdev_get_by_path() handles ERESTARTSYS]) AS_VERSION_COMPARE([$LINUX_VERSION], [5.13.0], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_GET_ERESTARTSYS, 1, [blkdev_get_by_path() handles ERESTARTSYS]) ],[ AC_MSG_RESULT(no) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 6.5.x API change dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [ ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [ #include ],[ blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT; ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined]) ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [ AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1, [BLK_STS_RESV_CONFLICT is defined]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) ]) ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH ZFS_AC_KERNEL_SRC_BDEV_FILE_OPEN_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_NR_BYTES ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_BLKDEV_PUT ZFS_AC_KERNEL_BLKDEV_REREAD_PART ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE 
ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_BLKDEV_BDEV_NR_BYTES ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS ZFS_AC_KERNEL_BLKDEV_ISSUE_DISCARD ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T ]) diff --git a/sys/contrib/openzfs/config/kernel-dentry-operations.m4 b/sys/contrib/openzfs/config/kernel-dentry-operations.m4 index aa5a9f2aff39..6d87ad0e0710 100644 --- a/sys/contrib/openzfs/config/kernel-dentry-operations.m4 +++ b/sys/contrib/openzfs/config/kernel-dentry-operations.m4 @@ -1,55 +1,57 @@ dnl # dnl # 2.6.28 API change dnl # Added d_obtain_alias() helper function. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS], [ ZFS_LINUX_TEST_SRC([d_obtain_alias], [ #include ], [ d_obtain_alias(NULL); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_D_OBTAIN_ALIAS], [ AC_MSG_CHECKING([whether d_obtain_alias() is available]) ZFS_LINUX_TEST_RESULT_SYMBOL([d_obtain_alias], [d_obtain_alias], [fs/dcache.c], [ AC_MSG_RESULT(yes) ], [ ZFS_LINUX_TEST_ERROR([d_obtain_alias()]) ]) ]) dnl # dnl # 2.6.38 API change dnl # Added d_set_d_op() helper function. dnl # +dnl # 6.17 API change +dnl # d_set_d_op() removed. No direct replacement. +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [ ZFS_LINUX_TEST_SRC([d_set_d_op], [ #include ], [ d_set_d_op(NULL, NULL); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_D_SET_D_OP], [ AC_MSG_CHECKING([whether d_set_d_op() is available]) - ZFS_LINUX_TEST_RESULT_SYMBOL([d_set_d_op], - [d_set_d_op], [fs/dcache.c], [ + ZFS_LINUX_TEST_RESULT([d_set_d_op], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_SET_D_OP, 1, + [Define if d_set_d_op() is available]) ], [ - ZFS_LINUX_TEST_ERROR([d_set_d_op]) + AC_MSG_RESULT(no) ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_DENTRY], [ ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS ZFS_AC_KERNEL_SRC_D_SET_D_OP - ZFS_AC_KERNEL_SRC_S_D_OP ]) AC_DEFUN([ZFS_AC_KERNEL_DENTRY], [ ZFS_AC_KERNEL_D_OBTAIN_ALIAS ZFS_AC_KERNEL_D_SET_D_OP - ZFS_AC_KERNEL_S_D_OP ]) diff --git a/sys/contrib/openzfs/config/kernel.m4 b/sys/contrib/openzfs/config/kernel.m4 index e3e7625db7d8..35819e4d68c5 100644 --- a/sys/contrib/openzfs/config/kernel.m4 +++ b/sys/contrib/openzfs/config/kernel.m4 @@ -1,1064 +1,1066 @@ dnl # dnl # Default ZFS kernel configuration dnl # AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ AM_COND_IF([BUILD_LINUX], [ dnl # Setup the kernel build environment. ZFS_AC_KERNEL ZFS_AC_QAT dnl # Sanity checks for module building and CONFIG_* defines ZFS_AC_KERNEL_CONFIG_DEFINED ZFS_AC_MODULE_SYMVERS dnl # Sequential ZFS_LINUX_TRY_COMPILE tests ZFS_AC_KERNEL_FPU_HEADER ZFS_AC_KERNEL_OBJTOOL_HEADER ZFS_AC_KERNEL_MISC_MINOR ZFS_AC_KERNEL_DECLARE_EVENT_CLASS dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests ZFS_AC_KERNEL_TEST_SRC ZFS_AC_KERNEL_TEST_RESULT AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" ]) AC_SUBST(KERNEL_MAKE) ]) ]) dnl # dnl # Generate and compile all of the kernel API test cases to determine dnl # which interfaces are available. By invoking the kernel build system dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. 
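dnl #
dnl # The parallelism of this step is bounded by the TEST_JOBS environment
dnl # variable (defaulting to $(nproc), see ZFS_LINUX_TEST_COMPILE_ALL and
dnl # ZFS_LINUX_COMPILE below), for example:
dnl #
dnl #   TEST_JOBS=4 ./configure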
dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_TYPES ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_GENERIC_FADVISE ZFS_AC_KERNEL_SRC_SCHED ZFS_AC_KERNEL_SRC_USLEEP_RANGE ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_SRC_INODE_TIMES ZFS_AC_KERNEL_SRC_PROC_OPERATIONS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_SRC_BIO ZFS_AC_KERNEL_SRC_BLKDEV ZFS_AC_KERNEL_SRC_BLK_QUEUE ZFS_AC_KERNEL_SRC_GENHD_FLAGS ZFS_AC_KERNEL_SRC_REVALIDATE_DISK ZFS_AC_KERNEL_SRC_GET_DISK_RO ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_SRC_XATTR ZFS_AC_KERNEL_SRC_ACL ZFS_AC_KERNEL_SRC_INODE_SETATTR ZFS_AC_KERNEL_SRC_INODE_GETATTR ZFS_AC_KERNEL_SRC_SHOW_OPTIONS ZFS_AC_KERNEL_SRC_SHRINKER ZFS_AC_KERNEL_SRC_MKDIR ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS ZFS_AC_KERNEL_SRC_CREATE ZFS_AC_KERNEL_SRC_PERMISSION ZFS_AC_KERNEL_SRC_TMPFILE ZFS_AC_KERNEL_SRC_AUTOMOUNT ZFS_AC_KERNEL_SRC_COMMIT_METADATA ZFS_AC_KERNEL_SRC_SETATTR_PREPARE ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED + ZFS_AC_KERNEL_SRC_DENTRY ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FST_MOUNT ZFS_AC_KERNEL_SRC_SB_DYING ZFS_AC_KERNEL_SRC_SET_NLINK ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO ZFS_AC_KERNEL_SRC_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_SRC_VFS_MIGRATEPAGE ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_WRITEPAGE ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_KMAP_LOCAL_PAGE ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT ZFS_AC_KERNEL_SRC_FPU ZFS_AC_KERNEL_SRC_FMODE_T ZFS_AC_KERNEL_SRC_KUIDGID_T ZFS_AC_KERNEL_SRC_KUID_HELPERS ZFS_AC_KERNEL_SRC_RENAME ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES ZFS_AC_KERNEL_SRC_PERCPU ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR ZFS_AC_KERNEL_SRC_MKNOD ZFS_AC_KERNEL_SRC_SYMLINK ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS ZFS_AC_KERNEL_SRC_SIGINFO ZFS_AC_KERNEL_SRC_SYSFS ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_SRC_STRLCPY ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_SRC_PAGEMAP_READAHEAD_PAGE ZFS_AC_KERNEL_SRC_ADD_DISK ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_SRC_IDMAP_MNT_API ZFS_AC_KERNEL_SRC_IDMAP_NO_USERNS ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV ZFS_AC_KERNEL_SRC_MM_PAGE_FLAGS ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING ZFS_AC_KERNEL_SRC_FILE ZFS_AC_KERNEL_SRC_PIN_USER_PAGES ZFS_AC_KERNEL_SRC_TIMER ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_WB_ERR ZFS_AC_KERNEL_SRC_SOPS_FREE_INODE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; riscv*) ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; esac AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) AC_MSG_RESULT([done]) ]) dnl # dnl # Check results of kernel interface tests. 
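dnl # Each ZFS_AC_KERNEL_* macro called below pairs with the
dnl # ZFS_AC_KERNEL_SRC_* macro of the same name above: the SRC macro emits
dnl # the conftest source, and the result macro keys off whether a valid
dnl # .ko was produced for it (see ZFS_LINUX_TEST_SRC and
dnl # ZFS_LINUX_TEST_RESULT further down).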
dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_TYPES ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_GENERIC_FADVISE ZFS_AC_KERNEL_SCHED ZFS_AC_KERNEL_USLEEP_RANGE ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_INODE_TIMES ZFS_AC_KERNEL_PROC_OPERATIONS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_BIO ZFS_AC_KERNEL_BLKDEV ZFS_AC_KERNEL_BLK_QUEUE ZFS_AC_KERNEL_GENHD_FLAGS ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_XATTR ZFS_AC_KERNEL_ACL ZFS_AC_KERNEL_INODE_SETATTR ZFS_AC_KERNEL_INODE_GETATTR ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_SHRINKER ZFS_AC_KERNEL_MKDIR ZFS_AC_KERNEL_LOOKUP_FLAGS ZFS_AC_KERNEL_CREATE ZFS_AC_KERNEL_PERMISSION ZFS_AC_KERNEL_TMPFILE ZFS_AC_KERNEL_AUTOMOUNT ZFS_AC_KERNEL_COMMIT_METADATA ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED + ZFS_AC_KERNEL_DENTRY ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT ZFS_AC_KERNEL_SB_DYING ZFS_AC_KERNEL_SET_NLINK ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_VFS_READ_FOLIO ZFS_AC_KERNEL_VFS_MIGRATE_FOLIO ZFS_AC_KERNEL_VFS_MIGRATEPAGE ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_WRITEPAGE ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_KMAP_LOCAL_PAGE ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN ZFS_AC_KERNEL_GENERIC_IO_ACCT ZFS_AC_KERNEL_FPU ZFS_AC_KERNEL_FMODE_T ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_KUID_HELPERS ZFS_AC_KERNEL_RENAME ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_TOTALHIGH_PAGES ZFS_AC_KERNEL_PERCPU ZFS_AC_KERNEL_GENERIC_FILLATTR ZFS_AC_KERNEL_MKNOD ZFS_AC_KERNEL_SYMLINK ZFS_AC_KERNEL_BIO_MAX_SEGS ZFS_AC_KERNEL_SIGINFO ZFS_AC_KERNEL_SYSFS ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_STRLCPY ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_PAGEMAP_READAHEAD_PAGE ZFS_AC_KERNEL_ADD_DISK ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_IDMAP_MNT_API ZFS_AC_KERNEL_IDMAP_NO_USERNS ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV ZFS_AC_KERNEL_MM_PAGE_FLAGS ZFS_AC_KERNEL_MM_PAGE_SIZE ZFS_AC_KERNEL_MM_PAGE_MAPPING ZFS_AC_KERNEL_1ARG_ASSIGN_STR ZFS_AC_KERNEL_FILE ZFS_AC_KERNEL_PIN_USER_PAGES ZFS_AC_KERNEL_TIMER ZFS_AC_KERNEL_SUPER_BLOCK_S_WB_ERR ZFS_AC_KERNEL_SOPS_FREE_INODE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; riscv*) ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; esac ]) dnl # dnl # Detect name used for Module.symvers file in kernel dnl # AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [ modpost=$LINUX/scripts/Makefile.modpost AC_MSG_CHECKING([kernel file name for module symbols]) AS_IF([test "x$enable_linux_builtin" != xyes -a -f "$modpost"], [ AS_IF([grep -q Modules.symvers $modpost], [ LINUX_SYMBOLS=Modules.symvers ], [ LINUX_SYMBOLS=Module.symvers ]) AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed. 
If you are building with a custom kernel, make sure *** the kernel is configured, built, and the '--with-linux=PATH' *** configure option refers to the location of the kernel source. ]) ]) ], [ LINUX_SYMBOLS=NONE ]) AC_MSG_RESULT($LINUX_SYMBOLS) AC_SUBST(LINUX_SYMBOLS) ]) dnl # dnl # Detect the kernel to be built against dnl # dnl # Most modern Linux distributions have separate locations for bare dnl # source (source) and prebuilt (build) files. Additionally, there are dnl # `source` and `build` symlinks in `/lib/modules/$(KERNEL_VERSION)` dnl # pointing to them. The directory search order is now: dnl # dnl # - `configure` command line values if both `--with-linux` and dnl # `--with-linux-obj` were defined dnl # dnl # - If only `--with-linux` was defined, `--with-linux-obj` is assumed dnl # to have the same value as `--with-linux` dnl # dnl # - If neither `--with-linux` nor `--with-linux-obj` were defined dnl # autodetection is used: dnl # dnl # - `/lib/modules/$(uname -r)/{source,build}` respectively, if exist. dnl # dnl # - If only `/lib/modules/$(uname -r)/build` exists, it is assumed dnl # to be both source and build directory. dnl # dnl # - The first directory in `/lib/modules` with the highest version dnl # number according to `sort -V` which contains both `source` and dnl # `build` symlinks/directories. If module directory contains only dnl # `build` component, it is assumed to be both source and build dnl # directory. dnl # dnl # - Last resort: the first directory matching `/usr/src/kernels/*` dnl # and `/usr/src/linux-*` with the highest version number according dnl # to `sort -V` is assumed to be both source and build directory. dnl # AC_DEFUN([ZFS_AC_KERNEL], [ AC_ARG_WITH([linux], AS_HELP_STRING([--with-linux=PATH], [Path to kernel source]), [kernelsrc="$withval"]) AC_ARG_WITH(linux-obj, AS_HELP_STRING([--with-linux-obj=PATH], [Path to kernel build objects]), [kernelbuild="$withval"]) AC_MSG_CHECKING([kernel source and build directories]) AS_IF([test -n "$kernelsrc" && test -z "$kernelbuild"], [ kernelbuild="$kernelsrc" ], [test -z "$kernelsrc"], [ AS_IF([test -e "/lib/modules/$(uname -r)/source" && \ test -e "/lib/modules/$(uname -r)/build"], [ src="/lib/modules/$(uname -r)/source" build="/lib/modules/$(uname -r)/build" ], [test -e "/lib/modules/$(uname -r)/build"], [ build="/lib/modules/$(uname -r)/build" src="$build" ], [ src= for d in $(ls -1d /lib/modules/* 2>/dev/null | sort -Vr); do if test -e "$d/source" && test -e "$d/build"; then src="$d/source" build="$d/build" break fi if test -e "$d/build"; then src="$d/build" build="$d/build" break fi done # the least reliable method if test -z "$src"; then src=$(ls -1d /usr/src/kernels/* /usr/src/linux-* \ 2>/dev/null | grep -v obj | sort -Vr | head -1) build="$src" fi ]) AS_IF([test -n "$src" && test -e "$src"], [ kernelsrc=$(readlink -e "$src") ], [ kernelsrc="[Not found]" ]) AS_IF([test -n "$build" && test -e "$build"], [ kernelbuild=$(readlink -e "$build") ], [ kernelbuild="[Not found]" ]) ], [ AS_IF([test "$kernelsrc" = "NONE"], [ kernsrcver=NONE ]) withlinux=yes ]) AC_MSG_RESULT([done]) AC_MSG_CHECKING([kernel source directory]) AC_MSG_RESULT([$kernelsrc]) AC_MSG_CHECKING([kernel build directory]) AC_MSG_RESULT([$kernelbuild]) AS_IF([test ! -d "$kernelsrc" || test ! -d "$kernelbuild"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed and then try again. 
If that fails, you can specify the *** location of the kernel source and build with the '--with-linux=PATH' and *** '--with-linux-obj=PATH' options respectively.]) ]) AC_MSG_CHECKING([kernel source version]) utsrelease1=$kernelbuild/include/linux/version.h utsrelease2=$kernelbuild/include/linux/utsrelease.h utsrelease3=$kernelbuild/include/generated/utsrelease.h AS_IF([test -r $utsrelease1 && grep -qF UTS_RELEASE $utsrelease1], [ utsrelease=$utsrelease1 ], [test -r $utsrelease2 && grep -qF UTS_RELEASE $utsrelease2], [ utsrelease=$utsrelease2 ], [test -r $utsrelease3 && grep -qF UTS_RELEASE $utsrelease3], [ utsrelease=$utsrelease3 ]) AS_IF([test -n "$utsrelease"], [ kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease) AS_IF([test -z "$kernsrcver"], [ AC_MSG_RESULT([Not found]) AC_MSG_ERROR([ *** Cannot determine kernel version. ]) ]) ], [ AC_MSG_RESULT([Not found]) if test "x$enable_linux_builtin" != xyes; then AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. ]) else AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. *** Please run 'make prepare' inside the kernel source tree.]) fi ]) AC_MSG_RESULT([$kernsrcver]) AX_COMPARE_VERSION([$kernsrcver], [ge], [$ZFS_META_KVER_MIN], [], [ AC_MSG_ERROR([ *** Cannot build against kernel version $kernsrcver. *** The minimum supported kernel version is $ZFS_META_KVER_MIN. ]) ]) AC_ARG_ENABLE([linux-experimental], AS_HELP_STRING([--enable-linux-experimental], [Allow building against some unsupported kernel versions])) AX_COMPARE_VERSION([$kernsrcver], [ge], [$ZFS_META_KVER_MAX], [ AX_COMPARE_VERSION([$kernsrcver], [eq2], [$ZFS_META_KVER_MAX], [ kern_max_version_ok=yes ], [ kern_max_version_ok=no ]) ], [ kern_max_version_ok=yes ]) AS_IF([test "x$kern_max_version_ok" != "xyes"], [ AS_IF([test "x$enable_linux_experimental" == "xyes"], [ AC_DEFINE(HAVE_LINUX_EXPERIMENTAL, 1, [building against unsupported kernel version]) ], [ AC_MSG_ERROR([ *** Cannot build against kernel version $kernsrcver. *** The maximum supported kernel version is $ZFS_META_KVER_MAX. ]) ]) ]) LINUX=${kernelsrc} LINUX_OBJ=${kernelbuild} LINUX_VERSION=${kernsrcver} AC_SUBST(LINUX) AC_SUBST(LINUX_OBJ) AC_SUBST(LINUX_VERSION) dnl # create a relatively unique numeric checksum based on the kernel dnl # version and path. this is included in the cache key below, dnl # allowing different cached values for different kernels _zfs_linux_cache_checksum=$(echo ${kernelsrc} {$kernelbuild} ${kernsrcver} | cksum | cut -f1 -d' ') ]) AC_DEFUN([ZFS_AC_KERNEL_VERSION_WARNING], [ AS_IF([test "x$enable_linux_experimental" = "xyes" && \ test "x$kern_max_version_ok" != "xyes"], [ AC_MSG_WARN([ You are building OpenZFS against Linux version $kernsrcver. This combination is considered EXPERIMENTAL by the OpenZFS project. Even if it appears to build and run correctly, there may be bugs that can cause SERIOUS DATA LOSS. YOU HAVE BEEN WARNED! If you choose to continue, we'd appreciate if you could report your results on the OpenZFS issue tracker at: https://github.com/openzfs/zfs/issues/new Your feedback will help us prepare a new OpenZFS release that supports this version of Linux. 
]) ]) ]) dnl # dnl # Detect the QAT module to be built against, QAT provides hardware dnl # acceleration for data compression: dnl # dnl # https://01.org/intel-quickassist-technology dnl # dnl # 1) Download and install QAT driver from the above link dnl # 2) Start QAT driver in your system: dnl # service qat_service start dnl # 3) Enable QAT in ZFS, e.g.: dnl # ./configure --with-qat=/QAT1.6 dnl # make dnl # 4) Set GZIP compression in ZFS dataset: dnl # zfs set compression = gzip dnl # dnl # Then the data written to this ZFS pool is compressed by QAT accelerator dnl # automatically, and de-compressed by QAT when read from the pool. dnl # dnl # 1) Get QAT hardware statistics with: dnl # cat /proc/icp_dh895xcc_dev/qat dnl # 2) To disable QAT: dnl # insmod zfs.ko zfs_qat_disable=1 dnl # AC_DEFUN([ZFS_AC_QAT], [ AC_ARG_WITH([qat], AS_HELP_STRING([--with-qat=PATH], [Path to qat source]), AS_IF([test "$withval" = "yes"], AC_MSG_ERROR([--with-qat=PATH requires a PATH]), [qatsrc="$withval"])) AC_ARG_WITH([qat-obj], AS_HELP_STRING([--with-qat-obj=PATH], [Path to qat build objects]), [qatbuild="$withval"]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat source directory]) AC_MSG_RESULT([$qatsrc]) QAT_SRC="${qatsrc}/quickassist" AS_IF([ test ! -e "$QAT_SRC/include/cpa.h"], [ AC_MSG_ERROR([ *** Please make sure the qat driver package is installed *** and specify the location of the qat source with the *** '--with-qat=PATH' option then try again. Failed to *** find cpa.h in: ${QAT_SRC}/include]) ]) ]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat build directory]) AS_IF([test -z "$qatbuild"], [ qatbuild="${qatsrc}/build" ]) AC_MSG_RESULT([$qatbuild]) QAT_OBJ=${qatbuild} AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find icp_qa_al.ko or qat_api.ko in: $QAT_OBJ]) ]) AC_SUBST(QAT_SRC) AC_SUBST(QAT_OBJ) AC_DEFINE(HAVE_QAT, 1, [qat is enabled and existed]) ]) dnl # dnl # Detect the name used for the QAT Module.symvers file. dnl # AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat file for module symbols]) QAT_SYMBOLS=$QAT_SRC/lookaside/access_layer/src/Module.symvers AS_IF([test -r $QAT_SYMBOLS], [ AC_MSG_RESULT([$QAT_SYMBOLS]) AC_SUBST(QAT_SYMBOLS) ],[ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find Module.symvers in: $QAT_SYMBOLS ]) ]) ]) ]) dnl # dnl # ZFS_LINUX_CONFTEST_H dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ test -d build/$2 || mkdir -p build/$2 cat - <<_ACEOF >build/$2/$2.h $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_C dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ test -d build/$2 || mkdir -p build/$2 cat confdefs.h - <<_ACEOF >build/$2/$2.c $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_MAKEFILE dnl # dnl # $1 - test case name dnl # $2 - add to top-level Makefile dnl # $3 - additional build flags dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [ test -d build || mkdir -p build test -d build/$1 || mkdir -p build/$1 file=build/$1/Makefile dnl # Example command line to manually build source. cat - <<_ACEOF >$file # Example command line to manually build source # make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1 ccflags-y := -Werror $FRAME_LARGER_THAN _ACEOF dnl # Additional custom CFLAGS as requested. 
m4_ifval($3, [echo "ccflags-y += $3" >>$file], []) dnl # Test case source echo "obj-m := $1.o" >>$file AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], []) ]) dnl # dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) dnl # m4_define([ZFS_LINUX_TEST_PROGRAM], [ #include $1 int main (void) { $2 ; return 0; } MODULE_DESCRIPTION("conftest"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); MODULE_LICENSE($3); ]) dnl # dnl # ZFS_LINUX_TEST_REMOVE dnl # dnl # Removes the specified test source and results. dnl # AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [ test -d build/$1 && rm -Rf build/$1 test -f build/Makefile && sed '/$1/d' build/Makefile ]) dnl # dnl # ZFS_LINUX_COMPILE dnl # dnl # $1 - build dir dnl # $2 - test command dnl # $3 - pass command dnl # $4 - fail command dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes' dnl # $6 - set KBUILD_MODPOST_WARN='yes' dnl # dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST} dnl # AC_DEFUN([ZFS_LINUX_COMPILE], [ AC_ARG_VAR([KERNEL_CC], [C compiler for building kernel modules]) AC_ARG_VAR([KERNEL_LD], [Linker for building kernel modules]) AC_ARG_VAR([KERNEL_LLVM], [Binary option to build kernel modules with LLVM/CLANG toolchain]) AC_ARG_VAR([KERNEL_CROSS_COMPILE], [Cross compile prefix for kernel module builds]) AC_ARG_VAR([KERNEL_ARCH], [Architecture to build kernel modules for]) AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC} ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM} CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES ${KERNEL_CROSS_COMPILE:+CROSS_COMPILE=$KERNEL_CROSS_COMPILE} ${KERNEL_ARCH:+ARCH=$KERNEL_ARCH} -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_TEST_COMPILE dnl # dnl # Perform a full compile excluding the final modpost phase. dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.compile.$1 mv $2/build.log $2/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to compile test source to determine kernel interfaces.]) ], [yes], []) ]) dnl # dnl # ZFS_LINUX_TEST_MODPOST dnl # dnl # Perform a full compile including the modpost phase. This may dnl # be an incremental build if the objects have already been built. dnl # AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.modpost.$1 cat $2/build.log >>build/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to modpost test source to determine kernel interfaces.]) ], [], [yes]) ]) dnl # dnl # Perform the compilation of the test cases in two phases. dnl # dnl # Phase 1) attempt to build the object files for all of the tests dnl # defined by the ZFS_LINUX_TEST_SRC macro. But do not dnl # perform the final modpost stage. dnl # dnl # Phase 2) disable all tests which failed the initial compilation, dnl # then invoke the final modpost step for the remaining tests. dnl # dnl # This allows us to efficiently build the test cases in parallel while dnl # remaining resilient to build failures which are expected when dnl # detecting the available kernel interfaces. dnl # dnl # The maximum allowed parallelism can be controlled by setting the dnl # TEST_JOBS environment variable. Otherwise, it defaults to $(nproc). dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ dnl # Phase 1 - Compilation only, final linking is skipped.
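dnl #
dnl # (Illustrative note, not part of this change: with the defaults above,
dnl #  phase 1 amounts to roughly
dnl #      KBUILD_MODPOST_NOFINAL=yes make modules -k -j$TEST_JOBS \
dnl #          -C $LINUX_OBJ M=$PWD/build
dnl #  so every test object is compiled in parallel, but no .ko is linked yet.)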
ZFS_LINUX_TEST_COMPILE([$1], [build]) dnl # dnl # Phase 2 - When building external modules disable test cases dnl # which failed to compile and invoke modpost to verify the dnl # final linking. dnl # dnl # Test names suffixed with '_license' call modpost independently dnl # to ensure that a single incompatibility does not result in the dnl # modpost phase exiting early. This check is not performed on dnl # every symbol since the majority are compatible and doing so dnl # would significantly slow down this phase. dnl # dnl # When configuring for builtin (--enable-linux-builtin) dnl # fake the linking step artificially create the expected .ko dnl # files for tests which did compile. This is required for dnl # kernels which do not have loadable module support or have dnl # not yet been built. dnl # AS_IF([test "x$enable_linux_builtin" = "xno"], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ AS_IF([test "${name##*_}" = "license"], [ ZFS_LINUX_TEST_MODPOST([$1], [build/$name]) echo "obj-n += $dir" >>build/Makefile ], [ echo "obj-m += $dir" >>build/Makefile ]) ], [ echo "obj-n += $dir" >>build/Makefile ]) done ZFS_LINUX_TEST_MODPOST([$1], [build]) ], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ touch build/$name/$name.ko ]) done ]) ]) dnl # dnl # ZFS_LINUX_TEST_SRC dnl # dnl # $1 - name dnl # $2 - global dnl # $3 - source dnl # $4 - extra cflags dnl # $5 - check license-compatibility dnl # dnl # Check if the test source is buildable at all and then if it is dnl # license compatible. dnl # dnl # N.B because all of the test cases are compiled in parallel they dnl # must never depend on the results of previous tests. Each test dnl # needs to be entirely independent. dnl # AC_DEFUN([ZFS_LINUX_TEST_SRC], [ cachevar="zfs_cv_kernel_[$1]_$_zfs_linux_cache_checksum" eval "cacheval=\$$cachevar" AS_IF([test "x$cacheval" = "x"], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]], [["Dual BSD/GPL"]])], [$1]) ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4]) AS_IF([ test -n "$5" ], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM( [[$2]], [[$3]], [[$5]])], [$1_license]) ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4]) ]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT dnl # dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC) dnl # $2 - run on success (valid .ko generated) dnl # $3 - run on failure (unable to compile) dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT], [ cachevar="zfs_cv_kernel_[$1]_$_zfs_linux_cache_checksum" AC_CACHE_VAL([$cachevar], [ AS_IF([test -d build/$1], [ AS_IF([test -f build/$1/$1.ko], [ eval "$cachevar=yes" ], [ eval "$cachevar=no" ]) ], [ AC_MSG_ERROR([ *** No matching source for the "$1" test, check that *** both the test source and result macros refer to the same name. ]) ]) ]) eval "cacheval=\$$cachevar" AS_IF([test "x$cacheval" = "xyes"], [$2], [$3]) ]) dnl # dnl # ZFS_LINUX_TEST_ERROR dnl # dnl # Generic error message which can be used when none of the expected dnl # kernel interfaces were detected. dnl # AC_DEFUN([ZFS_LINUX_TEST_ERROR], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. 
*** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT_SYMBOL dnl # dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to dnl # verify symbol exports, unless --enable-linux-builtin was provided to dnl # configure. dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [ cachevar="zfs_cv_kernel_[$1]_$_zfs_linux_cache_checksum" AC_CACHE_VAL([$cachevar], [ AS_IF([ ! test -f build/$1/$1.ko], [ eval "$cachevar=no" ], [ AS_IF([test "x$enable_linux_builtin" != "xyes"], [ ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [ eval "$cachevar=yes" ], [ eval "$cachevar=no" ]) ], [ eval "$cachevar=yes" ]) ]) ]) eval "cacheval=\$$cachevar" AS_IF([test "x$cacheval" = "xyes"], [$4], [$5]) ]) dnl # dnl # ZFS_LINUX_COMPILE_IFELSE dnl # AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [ ZFS_LINUX_TEST_REMOVE([conftest]) m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])]) m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])], [ZFS_LINUX_CONFTEST_H([], [conftest])]) ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no], [m4_ifvaln([$5], [-I$PWD/build/conftest], [])]) ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], []) ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE dnl # dnl # $1 - global dnl # $2 - source dnl # $3 - run on success (valid .ko generated) dnl # $4 - run on failure (unable to compile) dnl # dnl # When configuring as builtin (--enable-linux-builtin) for kernels dnl # without loadable module support (CONFIG_MODULES=n) only the object dnl # file is created. See ZFS_LINUX_TEST_COMPILE_ALL for details. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4]) ]) ]) dnl # dnl # ZFS_CHECK_SYMBOL_EXPORT dnl # dnl # Check if a symbol is exported or not by consulting the symbols dnl # file, or optionally the source code. dnl # AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [ grep -q -E '[[[:space:]]]$1[[[:space:]]]' \ $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null rc=$? if test $rc -ne 0; then export=0 for file in $2; do grep -q -E "EXPORT_SYMBOL.*($1)" \ "$LINUX/$file" 2>/dev/null rc=$? if test $rc -eq 0; then export=1 break; fi done if test $export -eq 0; then : $4 else : $3 fi else : $3 fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL dnl # dnl # Like ZFS_LINUX_TRY_COMPILE except ZFS_CHECK_SYMBOL_EXPORT is called dnl # to verify symbol exports, unless --enable-linux-builtin was provided dnl # to configure.
dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) if test $rc -ne 0; then : $6 else if test "x$enable_linux_builtin" != xyes; then ZFS_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1]) fi if test $rc -ne 0; then : $6 else : $5 fi fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_HEADER dnl # like ZFS_LINUX_TRY_COMPILE, except the contents conftest.h are dnl # provided via the fifth parameter dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4], [$5]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4], [$5]) ]) ]) dnl # dnl # AS_VERSION_COMPARE_LE dnl # like AS_VERSION_COMPARE_LE, but runs $3 if (and only if) $1 <= $2 dnl # AS_VERSION_COMPARE_LE (version-1, version-2, [action-if-less-or-equal], [action-if-greater]) dnl # AC_DEFUN([AS_VERSION_COMPARE_LE], [ AS_VERSION_COMPARE([$1], [$2], [$3], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_REQUIRE_API dnl # like ZFS_LINUX_TEST_ERROR, except only fails if the kernel is dnl # at least some specified version. dnl # AC_DEFUN([ZFS_LINUX_REQUIRE_API], [ AS_VERSION_COMPARE_LE([$2], [$kernsrcver], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. This *** interface is expected for kernels version "$2" and above. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. Newer kernels may have incompatible *** APIs. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ], [ AC_MSG_RESULT(no) ]) ]) diff --git a/sys/contrib/openzfs/config/user-statx.m4 b/sys/contrib/openzfs/config/user-statx.m4 index 0315f93e0c20..1ba74a40e9b8 100644 --- a/sys/contrib/openzfs/config/user-statx.m4 +++ b/sys/contrib/openzfs/config/user-statx.m4 @@ -1,34 +1,34 @@ dnl # dnl # Check for statx() function and STATX_MNT_ID availability dnl # AC_DEFUN([ZFS_AC_CONFIG_USER_STATX], [ - AC_CHECK_HEADERS([linux/stat.h], + AC_CHECK_HEADERS([sys/stat.h], [have_stat_headers=yes], [have_stat_headers=no]) AS_IF([test "x$have_stat_headers" = "xyes"], [ AC_CHECK_FUNC([statx], [ AC_DEFINE([HAVE_STATX], [1], [statx() is available]) dnl Check for STATX_MNT_ID availability AC_MSG_CHECKING([for STATX_MNT_ID]) AC_COMPILE_IFELSE([ AC_LANG_PROGRAM([[ - #include + #include ]], [[ struct statx stx; int mask = STATX_MNT_ID; (void)mask; (void)stx.stx_mnt_id; ]]) ], [ AC_MSG_RESULT([yes]) AC_DEFINE([HAVE_STATX_MNT_ID], [1], [STATX_MNT_ID is available]) ], [ AC_MSG_RESULT([no]) ]) ]) ], [ - AC_MSG_WARN([linux/stat.h not found; skipping statx support]) + AC_MSG_WARN([sys/stat.h not found; skipping statx support]) ]) ]) dnl end AC_DEFUN diff --git a/sys/contrib/openzfs/config/zfs-build.m4 b/sys/contrib/openzfs/config/zfs-build.m4 index 7cf1b02d8757..adf6576f3193 100644 --- a/sys/contrib/openzfs/config/zfs-build.m4 +++ b/sys/contrib/openzfs/config/zfs-build.m4 @@ -1,697 +1,699 @@ AC_DEFUN([ZFS_AC_LICENSE], [ AC_MSG_CHECKING([zfs author]) AC_MSG_RESULT([$ZFS_META_AUTHOR]) AC_MSG_CHECKING([zfs license]) AC_MSG_RESULT([$ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_DEBUG_ENABLE], [ DEBUG_CFLAGS="-Werror" DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_with_debug" WITH_DEBUG="true" AC_DEFINE(ZFS_DEBUG, 1, [zfs debugging enabled]) 
KERNEL_DEBUG_CFLAGS="-Werror" KERNEL_DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" ]) AC_DEFUN([ZFS_AC_DEBUG_DISABLE], [ DEBUG_CFLAGS="" DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" DEBUG_LDFLAGS="" DEBUG_ZFS="_without_debug" WITH_DEBUG="" KERNEL_DEBUG_CFLAGS="" KERNEL_DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" ]) dnl # dnl # When debugging is enabled: dnl # - Enable all ASSERTs (-DDEBUG) dnl # - Promote all compiler warnings to errors (-Werror) dnl # dnl # (If INVARIANTS is detected, we need to force DEBUG, or strange panics dnl # can ensue.) dnl # AC_DEFUN([ZFS_AC_DEBUG], [ AC_MSG_CHECKING([whether assertion support will be enabled]) AC_ARG_ENABLE([debug], [AS_HELP_STRING([--enable-debug], [Enable compiler and code assertions @<:@default=no@:>@])], [], [enable_debug=no]) AS_CASE(["x$enable_debug"], ["xyes"], [ZFS_AC_DEBUG_ENABLE], ["xno"], [ZFS_AC_DEBUG_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debug])]) AS_CASE(["x$enable_invariants"], ["xyes"], [], ["xno"], [], [ZFS_AC_DEBUG_INVARIANTS_DETECT]) AS_CASE(["x$enable_invariants"], ["xyes"], [ZFS_AC_DEBUG_ENABLE], ["xno"], [], [AC_MSG_ERROR([Unknown option $enable_invariants])]) AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUG_CPPFLAGS) AC_SUBST(DEBUG_LDFLAGS) AC_SUBST(DEBUG_ZFS) AC_SUBST(WITH_DEBUG) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_MSG_RESULT([$enable_debug]) ]) AC_DEFUN([ZFS_AC_DEBUGINFO_ENABLE], [ DEBUG_CFLAGS="$DEBUG_CFLAGS -g -fno-inline $NO_IPA_SRA" KERNEL_DEBUG_CFLAGS="$KERNEL_DEBUG_CFLAGS -fno-inline $KERNEL_NO_IPA_SRA" KERNEL_MAKE="$KERNEL_MAKE CONFIG_DEBUG_INFO=y" DEBUGINFO_ZFS="_with_debuginfo" ]) AC_DEFUN([ZFS_AC_DEBUGINFO_DISABLE], [ DEBUGINFO_ZFS="_without_debuginfo" ]) AC_DEFUN([ZFS_AC_DEBUGINFO], [ AC_MSG_CHECKING([whether debuginfo support will be forced]) AC_ARG_ENABLE([debuginfo], [AS_HELP_STRING([--enable-debuginfo], [Force generation of debuginfo @<:@default=no@:>@])], [], [enable_debuginfo=no]) AS_CASE(["x$enable_debuginfo"], ["xyes"], [ZFS_AC_DEBUGINFO_ENABLE], ["xno"], [ZFS_AC_DEBUGINFO_DISABLE], [AC_MSG_ERROR([Unknown option $enable_debuginfo])]) AC_SUBST(DEBUG_CFLAGS) AC_SUBST(DEBUGINFO_ZFS) AC_SUBST(KERNEL_DEBUG_CFLAGS) AC_SUBST(KERNEL_MAKE) AC_MSG_RESULT([$enable_debuginfo]) ]) dnl # dnl # Disabled by default, provides basic memory tracking. Track the total dnl # number of bytes allocated with kmem_alloc() and freed with kmem_free(). dnl # Then at module unload time if any bytes were leaked it will be reported dnl # on the console. dnl # AC_DEFUN([ZFS_AC_DEBUG_KMEM], [ AC_MSG_CHECKING([whether basic kmem accounting is enabled]) AC_ARG_ENABLE([debug-kmem], [AS_HELP_STRING([--enable-debug-kmem], [Enable basic kmem accounting @<:@default=no@:>@])], [], [enable_debug_kmem=no]) AS_IF([test "x$enable_debug_kmem" = xyes], [ KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM" DEBUG_KMEM_ZFS="_with_debug_kmem" ], [ DEBUG_KMEM_ZFS="_without_debug_kmem" ]) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_SUBST(DEBUG_KMEM_ZFS) AC_MSG_RESULT([$enable_debug_kmem]) ]) dnl # dnl # Disabled by default, provides detailed memory tracking. This feature dnl # also requires --enable-debug-kmem to be set. When enabled not only will dnl # total bytes be tracked but also the location of every kmem_alloc() and dnl # kmem_free(). When the module is unloaded a list of all leaked addresses dnl # and where they were allocated will be dumped to the console. Enabling dnl # this feature has a significant impact on performance but it makes finding dnl # memory leaks straight forward. 
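dnl #
dnl # (Illustrative note, not part of this change: detailed tracking only
dnl #  takes effect when both switches are given together, e.g.
dnl #      ./configure --enable-debug-kmem --enable-debug-kmem-tracking
dnl #  which adds -DDEBUG_KMEM and -DDEBUG_KMEM_TRACKING to
dnl #  KERNEL_DEBUG_CPPFLAGS.)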
dnl # AC_DEFUN([ZFS_AC_DEBUG_KMEM_TRACKING], [ AC_MSG_CHECKING([whether detailed kmem tracking is enabled]) AC_ARG_ENABLE([debug-kmem-tracking], [AS_HELP_STRING([--enable-debug-kmem-tracking], [Enable detailed kmem tracking @<:@default=no@:>@])], [], [enable_debug_kmem_tracking=no]) AS_IF([test "x$enable_debug_kmem_tracking" = xyes], [ KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM_TRACKING" DEBUG_KMEM_TRACKING_ZFS="_with_debug_kmem_tracking" ], [ DEBUG_KMEM_TRACKING_ZFS="_without_debug_kmem_tracking" ]) AC_SUBST(KERNEL_DEBUG_CPPFLAGS) AC_SUBST(DEBUG_KMEM_TRACKING_ZFS) AC_MSG_RESULT([$enable_debug_kmem_tracking]) ]) AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS_DETECT_FREEBSD], [ AS_IF([sysctl -n kern.conftxt | grep -Fqx $'options\tINVARIANTS'], [enable_invariants="yes"], [enable_invariants="no"]) ]) AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS_DETECT], [ AM_COND_IF([BUILD_FREEBSD], [ZFS_AC_DEBUG_INVARIANTS_DETECT_FREEBSD], [enable_invariants="no"]) ]) dnl # dnl # Detected for the running kernel by default, enables INVARIANTS features dnl # in the FreeBSD kernel module. This feature must be used when building dnl # for a FreeBSD kernel with "options INVARIANTS" in the KERNCONF and must dnl # not be used when the INVARIANTS option is absent. dnl # AC_DEFUN([ZFS_AC_DEBUG_INVARIANTS], [ AC_MSG_CHECKING([whether FreeBSD kernel INVARIANTS checks are enabled]) AC_ARG_ENABLE([invariants], [AS_HELP_STRING([--enable-invariants], [Enable FreeBSD kernel INVARIANTS checks [[default: detect]]])], [], [ZFS_AC_DEBUG_INVARIANTS_DETECT]) AS_IF([test "x$enable_invariants" = xyes], [WITH_INVARIANTS="true"], [WITH_INVARIANTS=""]) AC_SUBST(WITH_INVARIANTS) AC_MSG_RESULT([$enable_invariants]) ]) dnl # Disabled by default. If enabled allows a configured "turn objtools dnl # warnings into errors" (CONFIG_OBJTOOL_WERROR) behavior to take effect. dnl # If disabled, objtool warnings are never turned into errors. It can't dnl # be enabled if the kernel wasn't compiled with CONFIG_OBJTOOL_WERROR=y. dnl # AC_DEFUN([ZFS_AC_OBJTOOL_WERROR], [ AC_MSG_CHECKING([whether objtool error on warning behavior is enabled]) AC_ARG_ENABLE([objtool-werror], [AS_HELP_STRING([--enable-objtool-werror], [Enable objtool's error on warning behaviour if present @<:@default=no@:>@])], [enable_objtool_werror=$enableval], [enable_objtool_werror=no]) AC_MSG_RESULT([$enable_objtool_werror]) AS_IF([test x$CONFIG_OBJTOOL_WERROR_DEFINED = xyes],[ AS_IF([test x$enable_objtool_werror = xyes],[ AC_MSG_NOTICE([enable-objtool-werror defined, keeping -Werror ]) ],[ AC_MSG_NOTICE([enable-objtool-werror undefined, disabling -Werror ]) OBJTOOL_DISABLE_WERROR=y abs_objtool_binary=$kernelsrc/tools/objtool/objtool AS_IF([test -x $abs_objtool_binary],[],[ AC_MSG_ERROR([*** objtool binary $abs_objtool_binary not found]) ]) dnl # The path to the wrapper is defined in modules/Makefile.in. ]) ],[ dnl # We can't enable --Werror if it's not there. AS_IF([test x$enable_objtool_werror = xyes],[ AC_MSG_ERROR([ *** Cannot enable objtool-werror, *** a kernel built with CONFIG_OBJTOOL_WERROR=y is required. 
]) ],[]) ]) AC_SUBST(OBJTOOL_DISABLE_WERROR) AC_SUBST(abs_objtool_binary) ]) AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ AX_COUNT_CPUS([]) AC_SUBST(CPU_COUNT) ZFS_AC_CONFIG_ALWAYS_CC_NO_CLOBBERED ZFS_AC_CONFIG_ALWAYS_CC_INFINITE_RECURSION ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_INFINITE_RECURSION ZFS_AC_CONFIG_ALWAYS_CC_IMPLICIT_FALLTHROUGH ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH + ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_NO_FORMAT_ZERO_LENGTH ZFS_AC_CONFIG_ALWAYS_CC_FORMAT_OVERFLOW ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA ZFS_AC_CONFIG_ALWAYS_KERNEL_CC_NO_IPA_SRA ZFS_AC_CONFIG_ALWAYS_CC_ASAN ZFS_AC_CONFIG_ALWAYS_CC_UBSAN ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD ZFS_AC_CONFIG_ALWAYS_SYSTEM ZFS_AC_CONFIG_ALWAYS_ARCH + ZFS_AC_CONFIG_CHECK_ARCH_VAR ZFS_AC_CONFIG_ALWAYS_PYTHON ZFS_AC_CONFIG_ALWAYS_PYZFS ZFS_AC_CONFIG_ALWAYS_SED ZFS_AC_CONFIG_ALWAYS_CPPCHECK ZFS_AC_CONFIG_ALWAYS_SHELLCHECK ZFS_AC_CONFIG_ALWAYS_PARALLEL ]) AC_DEFUN([ZFS_AC_CONFIG], [ dnl # Remove the previous build test directory. rm -Rf build ZFS_CONFIG=all AC_ARG_WITH([config], AS_HELP_STRING([--with-config=CONFIG], [Config file 'kernel|user|all|srpm']), [ZFS_CONFIG="$withval"]) AC_ARG_ENABLE([linux-builtin], [AS_HELP_STRING([--enable-linux-builtin], [Configure for builtin in-tree kernel modules @<:@default=no@:>@])], [], [enable_linux_builtin=no]) AC_MSG_CHECKING([zfs config]) AC_MSG_RESULT([$ZFS_CONFIG]); AC_SUBST(ZFS_CONFIG) ZFS_AC_CONFIG_ALWAYS AM_COND_IF([BUILD_LINUX], [ AC_ARG_VAR([TEST_JOBS], [simultaneous jobs during configure]) if test "x$ac_cv_env_TEST_JOBS_set" != "xset"; then TEST_JOBS=$CPU_COUNT fi AC_SUBST(TEST_JOBS) ]) ZFS_INIT_SYSV= ZFS_INIT_SYSTEMD= ZFS_WANT_MODULES_LOAD_D= case "$ZFS_CONFIG" in kernel) ZFS_AC_CONFIG_KERNEL ;; user) ZFS_AC_CONFIG_USER ;; all) ZFS_AC_CONFIG_USER ZFS_AC_CONFIG_KERNEL ;; dist) ;; srpm) ;; *) AC_MSG_RESULT([Error!]) AC_MSG_ERROR([Bad value "$ZFS_CONFIG" for --with-config, user kernel|user|all|srpm]) ;; esac AM_CONDITIONAL([INIT_SYSV], [test "x$ZFS_INIT_SYSV" = "xyes"]) AM_CONDITIONAL([INIT_SYSTEMD], [test "x$ZFS_INIT_SYSTEMD" = "xyes"]) AM_CONDITIONAL([WANT_MODULES_LOAD_D], [test "x$ZFS_WANT_MODULES_LOAD_D" = "xyes"]) AM_CONDITIONAL([CONFIG_USER], [test "$ZFS_CONFIG" = user -o "$ZFS_CONFIG" = all]) AM_CONDITIONAL([CONFIG_KERNEL], [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && [test "x$enable_linux_builtin" != xyes ]) AM_CONDITIONAL([CONFIG_QAT], [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && [test "x$qatsrc" != x ]) AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) ]) dnl # dnl # Check for rpm+rpmbuild to build RPM packages. If these tools dnl # are missing it is non-fatal but you will not be able to build dnl # RPM packages and will be warned if you try too. dnl # dnl # By default the generic spec file will be used because it requires dnl # minimal dependencies. Distribution specific spec files can be dnl # placed under the 'rpm/' directory and enabled using dnl # the --with-spec= configure option. 
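dnl #
dnl # (Illustrative note, not part of this change: for example
dnl #      ./configure --with-spec=redhat
dnl #  switches RPM_SPEC_DIR from the default rpm/generic to rpm/redhat.)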
dnl # AC_DEFUN([ZFS_AC_RPM], [ RPM=rpm RPMBUILD=rpmbuild AC_MSG_CHECKING([whether $RPM is available]) AS_IF([tmp=$($RPM --version 2>/dev/null)], [ RPM_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') HAVE_RPM=yes AC_MSG_RESULT([$HAVE_RPM ($RPM_VERSION)]) ],[ HAVE_RPM=no AC_MSG_RESULT([$HAVE_RPM]) ]) AC_MSG_CHECKING([whether $RPMBUILD is available]) AS_IF([tmp=$($RPMBUILD --version 2>/dev/null)], [ RPMBUILD_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') HAVE_RPMBUILD=yes AC_MSG_RESULT([$HAVE_RPMBUILD ($RPMBUILD_VERSION)]) ],[ HAVE_RPMBUILD=no AC_MSG_RESULT([$HAVE_RPMBUILD]) ]) RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUGINFO_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(ASAN_ZFS) 1"' RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(UBSAN_ZFS) 1"' AS_IF([test "x$enable_debuginfo" = xyes], [ RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "__strip /bin/true"' ]) RPM_DEFINE_UTIL=' --define "_initconfdir $(initconfdir)"' dnl # Make the next three RPM_DEFINE_UTIL additions conditional, since dnl # their values may not be set when running: dnl # dnl # ./configure --with-config=srpm dnl # AS_IF([test -n "$dracutdir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_dracutdir $(dracutdir)"' ]) AS_IF([test -n "$udevdir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevdir $(udevdir)"' ]) AS_IF([test -n "$udevruledir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"' ]) AS_IF([test -n "$bashcompletiondir" ], [ RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_bashcompletiondir $(bashcompletiondir)"' ]) RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_VERSION)' RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_PKG_VERSION)' dnl # Override default lib directory on Debian/Ubuntu systems. The dnl # provided /usr/lib/rpm/platform//macros files do not dnl # specify the correct path for multiarch systems as described dnl # by the packaging guidelines. dnl # dnl # https://wiki.ubuntu.com/MultiarchSpec dnl # https://wiki.debian.org/Multiarch/Implementation dnl # AS_IF([test "$DEFAULT_PACKAGE" = "deb"], [ MULTIARCH_LIBDIR="lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)" RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_lib $(MULTIARCH_LIBDIR)"' AC_SUBST(MULTIARCH_LIBDIR) ]) dnl # Make RPM_DEFINE_KMOD additions conditional on CONFIG_KERNEL, dnl # since the values will not be set otherwise. The spec files dnl # provide defaults for them. 
dnl # RPM_DEFINE_KMOD='--define "_wrong_version_format_terminate_build 0"' AM_COND_IF([CONFIG_KERNEL], [ RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernels $(LINUX_VERSION)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "ksrc $(LINUX)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kobj $(LINUX_OBJ)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cc KERNEL_CC=$(KERNEL_CC)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_ld KERNEL_LD=$(KERNEL_LD)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_llvm KERNEL_LLVM=$(KERNEL_LLVM)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_cross_compile KERNEL_CROSS_COMPILE=$(KERNEL_CROSS_COMPILE)"' RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernel_arch KERNEL_ARCH=$(KERNEL_ARCH)"' ]) RPM_DEFINE_DKMS='' SRPM_DEFINE_COMMON='--define "build_src_rpm 1"' SRPM_DEFINE_UTIL= SRPM_DEFINE_KMOD= SRPM_DEFINE_DKMS= RPM_SPEC_DIR="rpm/generic" AC_ARG_WITH([spec], AS_HELP_STRING([--with-spec=SPEC], [Spec files 'generic|redhat']), [RPM_SPEC_DIR="rpm/$withval"]) AC_MSG_CHECKING([whether spec files are available]) AC_MSG_RESULT([yes ($RPM_SPEC_DIR/*.spec.in)]) AC_SUBST(HAVE_RPM) AC_SUBST(RPM) AC_SUBST(RPM_VERSION) AC_SUBST(HAVE_RPMBUILD) AC_SUBST(RPMBUILD) AC_SUBST(RPMBUILD_VERSION) AC_SUBST(RPM_SPEC_DIR) AC_SUBST(RPM_DEFINE_UTIL) AC_SUBST(RPM_DEFINE_KMOD) AC_SUBST(RPM_DEFINE_DKMS) AC_SUBST(RPM_DEFINE_COMMON) AC_SUBST(SRPM_DEFINE_UTIL) AC_SUBST(SRPM_DEFINE_KMOD) AC_SUBST(SRPM_DEFINE_DKMS) AC_SUBST(SRPM_DEFINE_COMMON) ]) dnl # dnl # Check for dpkg+dpkg-buildpackage to build DEB packages. If these dnl # tools are missing it is non-fatal but you will not be able to build dnl # DEB packages and will be warned if you try too. dnl # AC_DEFUN([ZFS_AC_DPKG], [ DPKG=dpkg DPKGBUILD=dpkg-buildpackage AC_MSG_CHECKING([whether $DPKG is available]) AS_IF([tmp=$($DPKG --version 2>/dev/null)], [ DPKG_VERSION=$(echo $tmp | $AWK '/Debian/ { print $[7] }') HAVE_DPKG=yes AC_MSG_RESULT([$HAVE_DPKG ($DPKG_VERSION)]) ],[ HAVE_DPKG=no AC_MSG_RESULT([$HAVE_DPKG]) ]) AC_MSG_CHECKING([whether $DPKGBUILD is available]) AS_IF([tmp=$($DPKGBUILD --version 2>/dev/null)], [ DPKGBUILD_VERSION=$(echo $tmp | \ $AWK '/Debian/ { print $[4] }' | cut -f-4 -d'.') HAVE_DPKGBUILD=yes AC_MSG_RESULT([$HAVE_DPKGBUILD ($DPKGBUILD_VERSION)]) ],[ HAVE_DPKGBUILD=no AC_MSG_RESULT([$HAVE_DPKGBUILD]) ]) AC_SUBST(HAVE_DPKG) AC_SUBST(DPKG) AC_SUBST(DPKG_VERSION) AC_SUBST(HAVE_DPKGBUILD) AC_SUBST(DPKGBUILD) AC_SUBST(DPKGBUILD_VERSION) AC_SUBST([CFGOPTS], ["$CFGOPTS"]) ]) dnl # dnl # Until native packaging for various different packing systems dnl # can be added the least we can do is attempt to use alien to dnl # convert the RPM packages to the needed package type. This is dnl # a hack but so far it has worked reasonable well. dnl # AC_DEFUN([ZFS_AC_ALIEN], [ ALIEN=alien AC_MSG_CHECKING([whether $ALIEN is available]) AS_IF([tmp=$($ALIEN --version 2>/dev/null)], [ ALIEN_VERSION=$(echo $tmp | $AWK '{ print $[3] }') ALIEN_MAJOR=$(echo ${ALIEN_VERSION} | $AWK -F'.' '{ print $[1] }') ALIEN_MINOR=$(echo ${ALIEN_VERSION} | $AWK -F'.' '{ print $[2] }') ALIEN_POINT=$(echo ${ALIEN_VERSION} | $AWK -F'.' 
'{ print $[3] }') HAVE_ALIEN=yes AC_MSG_RESULT([$HAVE_ALIEN ($ALIEN_VERSION)]) ],[ HAVE_ALIEN=no AC_MSG_RESULT([$HAVE_ALIEN]) ]) AC_SUBST(HAVE_ALIEN) AC_SUBST(ALIEN) AC_SUBST(ALIEN_VERSION) AC_SUBST(ALIEN_MAJOR) AC_SUBST(ALIEN_MINOR) AC_SUBST(ALIEN_POINT) ]) dnl # dnl # Using the VENDOR tag from config.guess set the default dnl # package type for 'make pkg': (rpm | deb | tgz) dnl # AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ AC_MSG_CHECKING([os distribution]) AC_ARG_WITH([vendor], [AS_HELP_STRING([--with-vendor], [Distribution vendor @<:@default=check@:>@])], [with_vendor=$withval], [with_vendor=check]) AS_IF([test "x$with_vendor" = "xcheck"],[ if test -f /etc/alpine-release ; then VENDOR=alpine ; elif test -f /etc/arch-release ; then VENDOR=arch ; elif test -f /etc/artix-release ; then VENDOR=artix ; elif test -f /etc/fedora-release ; then VENDOR=fedora ; elif test -f /bin/freebsd-version ; then VENDOR=freebsd ; elif test -f /etc/gentoo-release ; then VENDOR=gentoo ; elif test -f /etc/lunar.release ; then VENDOR=lunar ; elif test -f /etc/openEuler-release ; then VENDOR=openeuler ; elif test -f /etc/SuSE-release ; then VENDOR=sles ; elif test -f /etc/slackware-version ; then VENDOR=slackware ; elif test -f /etc/toss-release ; then VENDOR=toss ; elif test -f /etc/lsb-release ; then VENDOR=ubuntu ; # put debian and redhat last as derivatives may have also their file elif test -f /etc/debian_version ; then VENDOR=debian ; elif test -f /etc/redhat-release ; then VENDOR=redhat ; else VENDOR= ; fi], [ test "x${with_vendor}" != x],[ VENDOR="$with_vendor" ], [ VENDOR= ; ] ) AC_MSG_RESULT([$VENDOR]) AC_SUBST(VENDOR) AC_MSG_CHECKING([default package type]) case "$VENDOR" in alpine|arch|artix|gentoo|lunar|slackware) DEFAULT_PACKAGE=tgz ;; debian|ubuntu) DEFAULT_PACKAGE=deb ;; freebsd) DEFAULT_PACKAGE=pkg ;; *) # fedora|openeuler|redhat|sles|toss DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) AC_MSG_CHECKING([default init directory]) case "$VENDOR" in freebsd) initdir=$sysconfdir/rc.d ;; *) initdir=$sysconfdir/init.d;; esac AC_MSG_RESULT([$initdir]) AC_SUBST(initdir) AC_MSG_CHECKING([default shell]) case "$VENDOR" in alpine|gentoo) DEFAULT_INIT_SHELL=/sbin/openrc-run IS_SYSV_RC=false ;; artix) DEFAULT_INIT_SHELL=/usr/bin/openrc-run IS_SYSV_RC=false ;; *) DEFAULT_INIT_SHELL=/bin/sh IS_SYSV_RC=true ;; esac AC_MSG_RESULT([$DEFAULT_INIT_SHELL]) AC_SUBST(DEFAULT_INIT_SHELL) AC_SUBST(IS_SYSV_RC) AC_MSG_CHECKING([default nfs server init script]) AS_IF([test "$VENDOR" = "debian"], [DEFAULT_INIT_NFS_SERVER="nfs-kernel-server"], [DEFAULT_INIT_NFS_SERVER="nfs"] ) AC_MSG_RESULT([$DEFAULT_INIT_NFS_SERVER]) AC_SUBST(DEFAULT_INIT_NFS_SERVER) AC_MSG_CHECKING([default init config directory]) case "$VENDOR" in alpine|artix|gentoo) initconfdir=/etc/conf.d ;; fedora|openeuler|redhat|sles|toss) initconfdir=/etc/sysconfig ;; freebsd) initconfdir=$sysconfdir/rc.conf.d ;; *) # debian|ubuntu initconfdir=/etc/default ;; esac AC_MSG_RESULT([$initconfdir]) AC_SUBST(initconfdir) AC_MSG_CHECKING([whether initramfs-tools is available]) if test -d /usr/share/initramfs-tools ; then RPM_DEFINE_INITRAMFS='--define "_initramfs 1"' AC_MSG_RESULT([yes]) else RPM_DEFINE_INITRAMFS='' AC_MSG_RESULT([no]) fi AC_SUBST(RPM_DEFINE_INITRAMFS) AC_MSG_CHECKING([default bash completion directory]) case "$VENDOR" in alpine|arch|artix|debian|gentoo|ubuntu) bashcompletiondir=/usr/share/bash-completion/completions ;; freebsd) bashcompletiondir=$sysconfdir/bash_completion.d ;; *) bashcompletiondir=/etc/bash_completion.d 
;; esac AC_MSG_RESULT([$bashcompletiondir]) AC_SUBST(bashcompletiondir) ]) dnl # dnl # Default ZFS package configuration dnl # AC_DEFUN([ZFS_AC_PACKAGE], [ ZFS_AC_DEFAULT_PACKAGE AS_IF([test x$VENDOR != xfreebsd], [ ZFS_AC_RPM ZFS_AC_DPKG ZFS_AC_ALIEN ]) ]) diff --git a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install index 37284a78ad18..2362c83dfa3f 100644 --- a/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install +++ b/sys/contrib/openzfs/contrib/debian/openzfs-zfsutils.install @@ -1,142 +1,144 @@ etc/default/zfs etc/zfs/zfs-functions etc/zfs/zpool.d/ lib/systemd/system-generators/ lib/systemd/system-preset/ lib/systemd/system/zfs-import-cache.service lib/systemd/system/zfs-import-scan.service lib/systemd/system/zfs-import.target lib/systemd/system/zfs-load-key.service lib/systemd/system/zfs-mount.service lib/systemd/system/zfs-mount@.service lib/systemd/system/zfs-scrub-monthly@.timer lib/systemd/system/zfs-scrub-weekly@.timer lib/systemd/system/zfs-scrub@.service lib/systemd/system/zfs-trim-monthly@.timer lib/systemd/system/zfs-trim-weekly@.timer lib/systemd/system/zfs-trim@.service lib/systemd/system/zfs-share.service lib/systemd/system/zfs-volume-wait.service lib/systemd/system/zfs-volumes.target lib/systemd/system/zfs.target lib/udev/ sbin/fsck.zfs sbin/mount.zfs sbin/zdb sbin/zfs sbin/zfs_ids_to_path sbin/zgenhostid sbin/zhack sbin/zinject sbin/zpool sbin/zstream sbin/zstreamdump usr/bin/zvol_wait usr/lib/modules-load.d/ lib/ usr/lib/zfs-linux/zpool.d/ usr/lib/zfs-linux/zpool_influxdb usr/lib/zfs-linux/zfs_prepare_disk usr/sbin/arc_summary +usr/sbin/zarcsummary usr/sbin/arcstat +usr/sbin/zarcstat usr/sbin/dbufstat usr/sbin/zilstat usr/share/zfs/compatibility.d/ usr/share/bash-completion/completions usr/share/man/man1/arcstat.1 usr/share/man/man1/zhack.1 usr/share/man/man1/zvol_wait.1 usr/share/man/man5/ usr/share/man/man8/fsck.zfs.8 usr/share/man/man8/mount.zfs.8 usr/share/man/man8/vdev_id.8 usr/share/man/man8/zdb.8 usr/share/man/man8/zfs-allow.8 usr/share/man/man8/zfs-bookmark.8 usr/share/man/man8/zfs-change-key.8 usr/share/man/man8/zfs-clone.8 usr/share/man/man8/zfs-create.8 usr/share/man/man8/zfs-destroy.8 usr/share/man/man8/zfs-diff.8 usr/share/man/man8/zfs-get.8 usr/share/man/man8/zfs-groupspace.8 usr/share/man/man8/zfs-hold.8 usr/share/man/man8/zfs-inherit.8 usr/share/man/man8/zfs-list.8 usr/share/man/man8/zfs-load-key.8 usr/share/man/man8/zfs-mount-generator.8 usr/share/man/man8/zfs-mount.8 usr/share/man/man8/zfs-program.8 usr/share/man/man8/zfs-project.8 usr/share/man/man8/zfs-projectspace.8 usr/share/man/man8/zfs-promote.8 usr/share/man/man8/zfs-receive.8 usr/share/man/man8/zfs-recv.8 usr/share/man/man8/zfs-redact.8 usr/share/man/man8/zfs-release.8 usr/share/man/man8/zfs-rename.8 usr/share/man/man8/zfs-rewrite.8 usr/share/man/man8/zfs-rollback.8 usr/share/man/man8/zfs-send.8 usr/share/man/man8/zfs-set.8 usr/share/man/man8/zfs-share.8 usr/share/man/man8/zfs-snapshot.8 usr/share/man/man8/zfs-unallow.8 usr/share/man/man8/zfs-unload-key.8 usr/share/man/man8/zfs-unmount.8 usr/share/man/man8/zfs-unzone.8 usr/share/man/man8/zfs-upgrade.8 usr/share/man/man8/zfs-userspace.8 usr/share/man/man8/zfs-wait.8 usr/share/man/man8/zfs-zone.8 usr/share/man/man8/zfs.8 usr/share/man/man8/zfs_ids_to_path.8 usr/share/man/man8/zfs_prepare_disk.8 usr/share/man/man7/zfsconcepts.7 usr/share/man/man7/zfsprops.7 usr/share/man/man8/zgenhostid.8 usr/share/man/man8/zinject.8 usr/share/man/man8/zpool-add.8 
usr/share/man/man8/zpool-attach.8 usr/share/man/man8/zpool-checkpoint.8 usr/share/man/man8/zpool-clear.8 usr/share/man/man8/zpool-create.8 usr/share/man/man8/zpool-ddtprune.8 usr/share/man/man8/zpool-destroy.8 usr/share/man/man8/zpool-detach.8 usr/share/man/man8/zpool-ddtprune.8 usr/share/man/man8/zpool-events.8 usr/share/man/man8/zpool-export.8 usr/share/man/man8/zpool-get.8 usr/share/man/man8/zpool-history.8 usr/share/man/man8/zpool-import.8 usr/share/man/man8/zpool-initialize.8 usr/share/man/man8/zpool-iostat.8 usr/share/man/man8/zpool-labelclear.8 usr/share/man/man8/zpool-list.8 usr/share/man/man8/zpool-offline.8 usr/share/man/man8/zpool-online.8 usr/share/man/man8/zpool-prefetch.8 usr/share/man/man8/zpool-prefetch.8 usr/share/man/man8/zpool-reguid.8 usr/share/man/man8/zpool-remove.8 usr/share/man/man8/zpool-reopen.8 usr/share/man/man8/zpool-replace.8 usr/share/man/man8/zpool-resilver.8 usr/share/man/man8/zpool-scrub.8 usr/share/man/man8/zpool-set.8 usr/share/man/man8/zpool-split.8 usr/share/man/man8/zpool-status.8 usr/share/man/man8/zpool-sync.8 usr/share/man/man8/zpool-trim.8 usr/share/man/man8/zpool-upgrade.8 usr/share/man/man8/zpool-wait.8 usr/share/man/man8/zpool.8 usr/share/man/man7/vdevprops.7 usr/share/man/man7/zpoolconcepts.7 usr/share/man/man7/zpoolprops.7 usr/share/man/man8/zstream.8 usr/share/man/man8/zstreamdump.8 usr/share/man/man4/spl.4 usr/share/man/man4/zfs.4 usr/share/man/man7/zpool-features.7 usr/share/man/man8/zpool_influxdb.8 diff --git a/sys/contrib/openzfs/contrib/debian/rules.in b/sys/contrib/openzfs/contrib/debian/rules.in index 2b0568938b25..966e34bf9dc6 100755 --- a/sys/contrib/openzfs/contrib/debian/rules.in +++ b/sys/contrib/openzfs/contrib/debian/rules.in @@ -1,224 +1,226 @@ #!/usr/bin/make -f include /usr/share/dpkg/default.mk LSB_DISTRIBUTOR := $(shell lsb_release -is) NAME := $(shell awk '$$1 == "Name:" { print $$2; }' META) LINUX_MIN := $(shell awk '/Linux-Minimum:/{print $$2}' META) LINUX_NEXT := $(shell awk -F'[ .]' '/Linux-Maximum:/{print $$2 "." $$3+1}' META) DKMSFILES := module include config zfs.release.in autogen.sh copy-builtin META AUTHORS \ COPYRIGHT LICENSE README.md CODE_OF_CONDUCT.md NEWS NOTICE RELEASES.md ifndef KVERS KVERS=$(shell uname -r) endif non_epoch_version=$(shell echo $(KVERS) | perl -pe 's/^\d+://') PACKAGE=openzfs-zfs pmodules = $(PACKAGE)-modules-$(non_epoch_version) export DEB_BUILD_MAINT_OPTIONS = hardening=+all NUM_CPUS = $(shell nproc 2>/dev/null) PARALLEL = $(subst parallel=,,$(filter parallel=%,$(DEB_BUILD_OPTIONS))) NJOBS = -j$(or $(PARALLEL),$(NUM_CPUS),1) %: dh $@ --with autoreconf,dkms,python3,sphinxdoc override_dh_autoreconf: @# Embed the downstream version in the module. @sed -e 's/^Version:.*/Version: $(DEB_VERSION_UPSTREAM)/' -i.orig META dh_autoreconf override_dh_auto_configure: @# Build the userland, but don't build the kernel modules. 
dh_auto_configure -- @CFGOPTS@ \ --bindir=/usr/bin \ --sbindir=/sbin \ --libdir=/lib/"$(DEB_HOST_MULTIARCH)" \ --with-udevdir=/lib/udev \ --with-zfsexecdir=/usr/lib/zfs-linux \ --enable-systemd \ --enable-pyzfs \ --with-python=python3 \ --with-pammoduledir='/lib/$(DEB_HOST_MULTIARCH)/security' \ --with-pkgconfigdir='/usr/lib/$(DEB_HOST_MULTIARCH)/pkgconfig' \ --with-systemdunitdir=/lib/systemd/system \ --with-systemdpresetdir=/lib/systemd/system-preset \ --with-systemdgeneratordir=/lib/systemd/system-generators \ --with-config=user for i in $(wildcard $(CURDIR)/debian/*.install.in) ; do \ basename "$$i" | grep _KVERS_ && continue ; \ sed 's/@DEB_HOST_MULTIARCH@/$(DEB_HOST_MULTIARCH)/g' "$$i" > "$${i%%.in}" ; \ done ln -s '$(CURDIR)/etc/init.d/zfs-import' '$(CURDIR)/debian/openzfs-zfsutils.zfs-import.init' ln -s '$(CURDIR)/etc/init.d/zfs-load-key' '$(CURDIR)/debian/openzfs-zfsutils.zfs-load-key.init' ln -s '$(CURDIR)/etc/init.d/zfs-mount' '$(CURDIR)/debian/openzfs-zfsutils.zfs-mount.init' ln -s '$(CURDIR)/etc/init.d/zfs-share' '$(CURDIR)/debian/openzfs-zfsutils.zfs-share.init' ln -s '$(CURDIR)/etc/init.d/zfs-zed' '$(CURDIR)/debian/openzfs-zfs-zed.zfs-zed.init' override_dh_gencontrol: dh_gencontrol -- -Vlinux:Recommends="linux-libc-dev (<< $(LINUX_NEXT)~), linux-libc-dev (>= $(LINUX_MIN)~)," override_dh_auto_build: @# Get a bare copy of the source code for DKMS. @# This creates the $(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/ tree, which does not @# contain the userland sources. NB: Remove-userland-dist-rules.patch $(MAKE) distdir dh_auto_build override_dh_auto_install: @# Install the utilities. $(MAKE) install DESTDIR='$(CURDIR)/debian/tmp' # Move from bin_dir to /usr/sbin # Remove suffix (.py) as per policy 10.4 - Scripts # https://www.debian.org/doc/debian-policy/ch-files.html#s-scripts mkdir -p '$(CURDIR)/debian/tmp/usr/sbin/' mv '$(CURDIR)/debian/tmp/usr/bin/arc_summary' '$(CURDIR)/debian/tmp/usr/sbin/arc_summary' + mv '$(CURDIR)/debian/tmp/usr/bin/zarcsummary' '$(CURDIR)/debian/tmp/usr/sbin/zarcsummary' mv '$(CURDIR)/debian/tmp/usr/bin/arcstat' '$(CURDIR)/debian/tmp/usr/sbin/arcstat' + mv '$(CURDIR)/debian/tmp/usr/bin/zarcstat' '$(CURDIR)/debian/tmp/usr/sbin/zarcstat' mv '$(CURDIR)/debian/tmp/usr/bin/dbufstat' '$(CURDIR)/debian/tmp/usr/sbin/dbufstat' mv '$(CURDIR)/debian/tmp/usr/bin/zilstat' '$(CURDIR)/debian/tmp/usr/sbin/zilstat' @# Zed has dependencies outside of the system root. mv '$(CURDIR)/debian/tmp/sbin/zed' '$(CURDIR)/debian/tmp/usr/sbin/zed' sed -i 's|ExecStart=/sbin/|ExecStart=/usr/sbin/|g' '$(CURDIR)/debian/tmp/lib/systemd/system/zfs-zed.service' @# Install the DKMS source. 
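@# (Illustrative note, not part of this change: the steps below stage a
@# stripped tree under debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)
@# holding only $(DKMSFILES); dkms later rebuilds it per kernel, roughly
@# `dkms build -m $(NAME) -v $(DEB_VERSION_UPSTREAM)`.)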
@# We only want the files needed to build the modules install -D -t '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/scripts' \ '$(CURDIR)/scripts/dkms.postbuild' '$(CURDIR)/scripts/objtool-wrapper.in' $(foreach file,$(DKMSFILES),mv '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/$(file)' '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)' || exit 1;) @# Only ever build Linux modules echo 'SUBDIRS = linux' > '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/include/os/Makefile.am' @# Hellish awk line: @# * Deletes from configure.ac the parts not needed for building the kernel module @# * It deletes from inside AC_CONFIG_FILES([]) everything except: @# - Makefile$ @# - include/(Makefile|sys|os/(Makefile|linux)) @# - module/ @# - zfs.release$ @# * Takes care of spaces and tabs @# * Remove reference to ZFS_AC_PACKAGE awk '/^AC_CONFIG_FILES\(\[/,/\]\)/ {\ if ($$0 !~ /^(AC_CONFIG_FILES\(\[([ \t]+)?$$|\]\)([ \t]+)?$$|([ \t]+)?(include\/(Makefile|sys|os\/(Makefile|linux))|module\/|Makefile([ \t]+)?$$|zfs\.release([ \t]+)?$$))|scripts\/objtool-wrapper.*\]\)$$/) \ {next} } {print}' \ '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac' | sed '/ZFS_AC_PACKAGE/d' > '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/configure.ac' @# Set "SUBDIRS = module include" for CONFIG_KERNEL and remove SUBDIRS for all other configs. @# Do not regenerate zfs_gitrev.h during dkms build sed '1,/CONFIG_KERNEL/s/SUBDIRS.*=.*//g;s/SUBDIRS.*=.*/SUBDIRS = module include/g;/make_gitrev.sh/d' \ '$(CURDIR)/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am' > '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am' @# Sanity test grep -q 'SUBDIRS = module include' '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am' sed -i '/rpm.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am sed -i '/cmd.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am sed -i '/contrib.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am sed -i '/etc.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am sed -i '/lib.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am sed -i '/man.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am sed -i '/scripts.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am sed -i '/tests.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am sed -i '/udev.Makefile/d' $(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/Makefile.am @# Run autogen on the stripped source tree cd '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)'; ./autogen.sh rm -fr '$(CURDIR)/debian/tmp/usr/src/$(NAME)-$(DEB_VERSION_UPSTREAM)/autom4te.cache' for i in `ls $(CURDIR)/debian/tmp/lib/$(DEB_HOST_MULTIARCH)/*.so`; do \ ln -s '/lib/$(DEB_HOST_MULTIARCH)/'`readlink $${i}` '$(CURDIR)/debian/tmp/usr/lib/$(DEB_HOST_MULTIARCH)/'`basename $${i}`; \ rm $${i}; \ done chmod a-x '$(CURDIR)/debian/tmp/etc/zfs/zfs-functions' chmod a-x '$(CURDIR)/debian/tmp/etc/default/zfs' override_dh_python3: dh_python3 -p openzfs-python3-pyzfs override_dh_dkms: '$(CURDIR)/scripts/dkms.mkconf' -n $(NAME) -v $(DEB_VERSION_UPSTREAM) -f '$(CURDIR)/scripts/zfs-dkms.dkms' dh_dkms rm -f '$(CURDIR)/scripts/zfs-dkms.dkms' override_dh_makeshlibs: dh_makeshlibs -a -V override_dh_strip: dh_strip override_dh_auto_clean: rm -rf 
zfs-$(DEB_VERSION_UPSTREAM) dh_auto_clean @if test -e META.orig; then mv META.orig META; fi override_dh_install: find debian/tmp/lib -name '*.la' -delete dh_install override_dh_missing: dh_missing --fail-missing override_dh_installinit: dh_installinit -r --no-restart-after-upgrade --name zfs-import dh_installinit -r --no-restart-after-upgrade --name zfs-mount dh_installinit -r --no-restart-after-upgrade --name zfs-load-key dh_installinit -R --name zfs-share dh_installinit -R --name zfs-zed override_dh_installsystemd: mkdir -p debian/openzfs-zfsutils/lib/systemd/system ln -sr /dev/null debian/openzfs-zfsutils/lib/systemd/system/zfs-import.service dh_installsystemd --no-stop-on-upgrade -X zfs-zed.service dh_installsystemd --name zfs-zed override_dh_installdocs: dh_installdocs -A ifeq (,$(findstring nodoc, $(DEB_BUILD_OPTIONS))) http_proxy='127.0.0.1:9' sphinx-build -N -bhtml "$(CURDIR)/contrib/pyzfs/docs/source/" debian/openzfs-pyzfs-doc/usr/share/doc/openzfs-pyzfs-doc/html/ endif # ------------ override_dh_prep-deb-files: for templ in $(wildcard $(CURDIR)/debian/*_KVERS_*.in); do \ sed -e 's/##KVERS##/$(KVERS)/g ; s/#KVERS#/$(KVERS)/g ; s/_KVERS_/$(KVERS)/g ; s/##KDREV##/$(KDREV)/g ; s/#KDREV#/$(KDREV)/g ; s/_KDREV_/$(KDREV)/g ; s/_ARCH_/$(DEB_HOST_ARCH)/' \ < $$templ > `echo $$templ | sed -e 's/_KVERS_/$(KVERS)/g ; s/_ARCH_/$(DEB_HOST_ARCH)/g ; s/\.in$$//'` ; \ done sed -e 's/##KVERS##/$(KVERS)/g ; s/#KVERS#/$(KVERS)/g ; s/_KVERS_/$(KVERS)/g ; s/##KDREV##/$(KDREV)/g ; s/#KDREV#/$(KDREV)/g ; s/_KDREV_/$(KDREV)/g ; s/_ARCH_/$(DEB_HOST_ARCH)/g' \ < debian/control.modules.in > debian/control override_dh_configure_modules: override_dh_configure_modules_stamp override_dh_configure_modules_stamp: ./configure @CFGOPTS@ \ --with-config=kernel \ --with-linux=$(KSRC) \ --with-linux-obj=$(KOBJ) touch override_dh_configure_modules_stamp override_dh_binary-modules: override_dh_prep-deb-files override_dh_configure_modules dh_testdir dh_testroot dh_prep $(MAKE) $(NJOBS) -C $(CURDIR)/module modules dh_install -p${pmodules} dh_installdocs -p${pmodules} dh_installchangelogs -p${pmodules} dh_compress -p${pmodules} dh_strip -p${pmodules} dh_fixperms -p${pmodules} dh_installdeb -p${pmodules} dh_gencontrol -p${pmodules} dh_md5sums -p${pmodules} dh_builddeb -p${pmodules} debian-copyright: cme update dpkg-copyright -file debian/copyright.cme diff --git a/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in b/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in index 4776087d9a76..db9bf0e20274 100644 --- a/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in +++ b/sys/contrib/openzfs/contrib/initramfs/hooks/zfsunlock.in @@ -1,10 +1,19 @@ #!/bin/sh if [ "$1" = "prereqs" ]; then echo "dropbear" exit fi . 
/usr/share/initramfs-tools/hook-functions copy_exec /usr/share/initramfs-tools/zfsunlock /usr/bin/zfsunlock + +if [ -f /etc/initramfs-tools/etc/motd ]; then + copy_file text /etc/initramfs-tools/etc/motd /etc/motd +else + tmpf=$(mktemp) + echo "If you use zfs encrypted root filesystems, you can use \`zfsunlock\` to manually unlock it" > "$tmpf" + copy_file text "$tmpf" /etc/motd + rm -f "$tmpf" +fi diff --git a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c index a0bc172c6f44..88698dedabbc 100644 --- a/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c +++ b/sys/contrib/openzfs/contrib/pam_zfs_key/pam_zfs_key.c @@ -1,1088 +1,1092 @@ // SPDX-License-Identifier: BSD-3-Clause /* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * Neither the name of the nor the * names of its contributors may be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Copyright (c) 2020, Felix Dörre * All rights reserved. */ #include #include #include #include #include #include #define PAM_SM_AUTH #define PAM_SM_PASSWORD #define PAM_SM_SESSION #include #if defined(__linux__) #include #define MAP_FLAGS MAP_PRIVATE | MAP_ANONYMOUS #elif defined(__FreeBSD__) #include static void pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...) { (void) pamh; va_list args; va_start(args, fmt); vsyslog(loglevel, fmt, args); va_end(args); } #define MAP_FLAGS MAP_PRIVATE | MAP_ANON | MAP_NOCORE #endif #include #include #include #include #include #include #include #include #include static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok"; static const char OLD_PASSWORD_VAR_NAME[] = "pam_zfs_key_oldauthtok"; static libzfs_handle_t *g_zfs; static void destroy_pw(pam_handle_t *pamh, void *data, int errcode); typedef int (*mlock_func_t) (const void *, size_t); typedef struct { size_t len; char *value; } pw_password_t; /* * Try to mlock(2) or munlock(2) addr while handling EAGAIN by retrying ten * times and sleeping 10 milliseconds in between for a total of 0.1 * seconds. lock_func must point to either mlock(2) or munlock(2). 
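 *
 * (Illustrative note, not part of this change: a typical call below is
 *  try_lock(mlock, pw->value, pw->len), which retries only while the
 *  underlying call keeps failing with EAGAIN and gives up after ~0.1 s.)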
*/ static int try_lock(mlock_func_t lock_func, const void *addr, size_t len) { int err; int retries = 10; useconds_t sleep_dur = 10 * 1000; if ((err = (*lock_func)(addr, len)) != EAGAIN) { return (err); } for (int i = retries; i > 0; --i) { (void) usleep(sleep_dur); if ((err = (*lock_func)(addr, len)) != EAGAIN) { break; } } return (err); } static pw_password_t * alloc_pw_size(size_t len) { pw_password_t *pw = malloc(sizeof (pw_password_t)); if (!pw) { return (NULL); } pw->len = len; /* * We use mmap(2) rather than malloc(3) since later on we mlock(2) the * memory region. Since mlock(2) and munlock(2) operate on whole memory * pages we should allocate a whole page here as mmap(2) does. Further * this ensures that the addresses passed to mlock(2) an munlock(2) are * on a page boundary as suggested by FreeBSD and required by some * other implementations. Finally we avoid inadvertently munlocking * memory mlocked by an concurrently running instance of us. */ pw->value = mmap(NULL, pw->len, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); if (pw->value == MAP_FAILED) { free(pw); return (NULL); } if (try_lock(mlock, pw->value, pw->len) != 0) { (void) munmap(pw->value, pw->len); free(pw); return (NULL); } return (pw); } static pw_password_t * alloc_pw_string(const char *source) { size_t len = strlen(source) + 1; pw_password_t *pw = alloc_pw_size(len); if (!pw) { return (NULL); } memcpy(pw->value, source, pw->len); return (pw); } static void pw_free(pw_password_t *pw) { memset(pw->value, 0, pw->len); if (try_lock(munlock, pw->value, pw->len) == 0) { (void) munmap(pw->value, pw->len); } free(pw); } static pw_password_t * pw_fetch(pam_handle_t *pamh, int tok) { const char *token; if (pam_get_authtok(pamh, tok, &token, NULL) != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "couldn't get password from PAM stack"); return (NULL); } if (!token) { pam_syslog(pamh, LOG_ERR, "token from PAM stack is null"); return (NULL); } return (alloc_pw_string(token)); } static const pw_password_t * pw_fetch_lazy(pam_handle_t *pamh, int tok, const char *var_name) { pw_password_t *pw = pw_fetch(pamh, tok); if (pw == NULL) { return (NULL); } int ret = pam_set_data(pamh, var_name, pw, destroy_pw); if (ret != PAM_SUCCESS) { pw_free(pw); pam_syslog(pamh, LOG_ERR, "pam_set_data failed"); return (NULL); } return (pw); } static const pw_password_t * pw_get(pam_handle_t *pamh, int tok, const char *var_name) { const pw_password_t *authtok = NULL; int ret = pam_get_data(pamh, var_name, (const void**)(&authtok)); if (ret == PAM_SUCCESS) return (authtok); if (ret == PAM_NO_MODULE_DATA) return (pw_fetch_lazy(pamh, tok, var_name)); pam_syslog(pamh, LOG_ERR, "password not available"); return (NULL); } static int pw_clear(pam_handle_t *pamh, const char *var_name) { int ret = pam_set_data(pamh, var_name, NULL, NULL); if (ret != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "clearing password failed"); return (-1); } return (0); } static void destroy_pw(pam_handle_t *pamh, void *data, int errcode) { (void) pamh, (void) errcode; if (data != NULL) { pw_free((pw_password_t *)data); } } static int pam_zfs_init(pam_handle_t *pamh) { int error = 0; if ((g_zfs = libzfs_init()) == NULL) { error = errno; pam_syslog(pamh, LOG_ERR, "Zfs initialization error: %s", libzfs_error_init(error)); } return (error); } static void pam_zfs_free(void) { libzfs_fini(g_zfs); } static pw_password_t * prepare_passphrase(pam_handle_t *pamh, zfs_handle_t *ds, const char *passphrase, nvlist_t *nvlist) { pw_password_t *key = alloc_pw_size(WRAPPING_KEY_LEN); if (!key) { return 
(NULL); } uint64_t salt; uint64_t iters; if (nvlist != NULL) { int fd = open("/dev/urandom", O_RDONLY); if (fd < 0) { pw_free(key); return (NULL); } int bytes_read = 0; char *buf = (char *)&salt; size_t bytes = sizeof (uint64_t); while (bytes_read < bytes) { ssize_t len = read(fd, buf + bytes_read, bytes - bytes_read); if (len < 0) { close(fd); pw_free(key); return (NULL); } bytes_read += len; } close(fd); if (nvlist_add_uint64(nvlist, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt)) { pam_syslog(pamh, LOG_ERR, "failed to add salt to nvlist"); pw_free(key); return (NULL); } iters = DEFAULT_PBKDF2_ITERATIONS; if (nvlist_add_uint64(nvlist, zfs_prop_to_name( ZFS_PROP_PBKDF2_ITERS), iters)) { pam_syslog(pamh, LOG_ERR, "failed to add iters to nvlist"); pw_free(key); return (NULL); } } else { salt = zfs_prop_get_int(ds, ZFS_PROP_PBKDF2_SALT); iters = zfs_prop_get_int(ds, ZFS_PROP_PBKDF2_ITERS); } salt = LE_64(salt); if (!PKCS5_PBKDF2_HMAC_SHA1((char *)passphrase, strlen(passphrase), (uint8_t *)&salt, sizeof (uint64_t), iters, WRAPPING_KEY_LEN, (uint8_t *)key->value)) { pam_syslog(pamh, LOG_ERR, "pbkdf failed"); pw_free(key); return (NULL); } return (key); } static int is_key_loaded(pam_handle_t *pamh, const char *ds_name) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } int keystatus = zfs_prop_get_int(ds, ZFS_PROP_KEYSTATUS); zfs_close(ds); return (keystatus != ZFS_KEYSTATUS_UNAVAILABLE); } static int change_key(pam_handle_t *pamh, const char *ds_name, const char *passphrase) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } nvlist_t *nvlist = fnvlist_alloc(); pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, nvlist); if (key == NULL) { nvlist_free(nvlist); zfs_close(ds); return (-1); } if (nvlist_add_string(nvlist, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt")) { pam_syslog(pamh, LOG_ERR, "nvlist_add failed for keylocation"); pw_free(key); nvlist_free(nvlist); zfs_close(ds); return (-1); } if (nvlist_add_uint64(nvlist, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_PASSPHRASE)) { pam_syslog(pamh, LOG_ERR, "nvlist_add failed for keyformat"); pw_free(key); nvlist_free(nvlist); zfs_close(ds); return (-1); } int ret = lzc_change_key(ds_name, DCP_CMD_NEW_KEY, nvlist, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); if (ret) { pam_syslog(pamh, LOG_ERR, "change_key failed: %d", ret); nvlist_free(nvlist); zfs_close(ds); return (-1); } nvlist_free(nvlist); zfs_close(ds); return (0); } typedef struct { char *homes_prefix; char *runstatedir; char *homedir; char *dsname; uid_t uid_min; uid_t uid_max; uid_t uid; const char *username; boolean_t unmount_and_unload; boolean_t force_unmount; boolean_t recursive_homes; boolean_t mount_recursively; } zfs_key_config_t; static int zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, int argc, const char **argv) { +#if defined(__FreeBSD__) + config->homes_prefix = strdup("zroot/home"); +#else config->homes_prefix = strdup("rpool/home"); +#endif if (config->homes_prefix == NULL) { pam_syslog(pamh, LOG_ERR, "strdup failure"); return (PAM_SERVICE_ERR); } config->runstatedir = strdup(RUNSTATEDIR "/pam_zfs_key"); if (config->runstatedir == NULL) { pam_syslog(pamh, LOG_ERR, "strdup failure"); free(config->homes_prefix); return (PAM_SERVICE_ERR); } const char *name; if (pam_get_user(pamh, &name, NULL) != 
PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "couldn't get username from PAM stack"); free(config->runstatedir); free(config->homes_prefix); return (PAM_SERVICE_ERR); } struct passwd *entry = getpwnam(name); if (!entry) { free(config->runstatedir); free(config->homes_prefix); return (PAM_USER_UNKNOWN); } config->uid_min = 1000; config->uid_max = MAXUID; config->uid = entry->pw_uid; config->username = name; config->unmount_and_unload = B_TRUE; config->force_unmount = B_FALSE; config->recursive_homes = B_FALSE; config->mount_recursively = B_FALSE; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { if (strncmp(argv[c], "homes=", 6) == 0) { free(config->homes_prefix); config->homes_prefix = strdup(argv[c] + 6); } else if (strncmp(argv[c], "runstatedir=", 12) == 0) { free(config->runstatedir); config->runstatedir = strdup(argv[c] + 12); } else if (strncmp(argv[c], "uid_min=", 8) == 0) { sscanf(argv[c] + 8, "%u", &config->uid_min); } else if (strncmp(argv[c], "uid_max=", 8) == 0) { sscanf(argv[c] + 8, "%u", &config->uid_max); } else if (strcmp(argv[c], "nounmount") == 0) { config->unmount_and_unload = B_FALSE; } else if (strcmp(argv[c], "forceunmount") == 0) { config->force_unmount = B_TRUE; } else if (strcmp(argv[c], "recursive_homes") == 0) { config->recursive_homes = B_TRUE; } else if (strcmp(argv[c], "mount_recursively") == 0) { config->mount_recursively = B_TRUE; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { if (config->homedir == NULL) config->homedir = strdup(entry->pw_dir); } } return (PAM_SUCCESS); } typedef struct { pam_handle_t *pamh; zfs_key_config_t *target; } mount_umount_dataset_data_t; static int mount_dataset(zfs_handle_t *zhp, void *data) { mount_umount_dataset_data_t *mount_umount_dataset_data = data; zfs_key_config_t *target = mount_umount_dataset_data->target; pam_handle_t *pamh = mount_umount_dataset_data->pamh; /* Refresh properties to get the latest key status */ zfs_refresh_properties(zhp); int ret = 0; /* Check if dataset type is filesystem */ if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) { pam_syslog(pamh, LOG_DEBUG, "dataset is not filesystem: %s, skipping.", zfs_get_name(zhp)); return (0); } /* Check if encryption key is available */ if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE) { pam_syslog(pamh, LOG_WARNING, "key unavailable for: %s, skipping", zfs_get_name(zhp)); return (0); } /* Check if prop canmount is on */ if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) != ZFS_CANMOUNT_ON) { pam_syslog(pamh, LOG_INFO, "canmount is not on for: %s, skipping", zfs_get_name(zhp)); return (0); } /* Get mountpoint prop for check */ char mountpoint[ZFS_MAXPROPLEN]; if ((ret = zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, 1)) != 0) { pam_syslog(pamh, LOG_ERR, "failed to get mountpoint prop: %d", ret); return (-1); } /* Check if mountpoint isn't none or legacy */ if (strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) == 0 || strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) == 0) { pam_syslog(pamh, LOG_INFO, "mountpoint is none or legacy for: %s, skipping", zfs_get_name(zhp)); return (0); } /* Don't mount the dataset if already mounted */ if (zfs_is_mounted(zhp, NULL)) { pam_syslog(pamh, LOG_INFO, "already mounted: %s", zfs_get_name(zhp)); return (0); } /* Mount the dataset */ ret = zfs_mount(zhp, NULL, 0); if (ret) { pam_syslog(pamh, LOG_ERR, "zfs_mount failed for %s with: %d", zfs_get_name(zhp), ret); return (ret); } /* Recursively mount children if the recursive flag is set */ if 
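zfs_key_config_load() above exposes every knob as a module argument: homes=, runstatedir=, uid_min=, uid_max=, nounmount, forceunmount, recursive_homes, mount_recursively and prop_mountpoint. An illustrative PAM stack fragment using a few of them; the service file, control flags and dataset name are assumptions, not taken from this change:

    auth       optional   pam_zfs_key.so   homes=rpool/home
    session    optional   pam_zfs_key.so   homes=rpool/home mount_recursively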
(target->mount_recursively) { ret = zfs_iter_filesystems_v2(zhp, 0, mount_dataset, data); if (ret != 0) { pam_syslog(pamh, LOG_ERR, "child iteration failed: %d", ret); return (-1); } } return (ret); } static int umount_dataset(zfs_handle_t *zhp, void *data) { mount_umount_dataset_data_t *mount_umount_dataset_data = data; zfs_key_config_t *target = mount_umount_dataset_data->target; pam_handle_t *pamh = mount_umount_dataset_data->pamh; int ret = 0; /* Recursively umount children if the recursive flag is set */ if (target->mount_recursively) { ret = zfs_iter_filesystems_v2(zhp, 0, umount_dataset, data); if (ret != 0) { pam_syslog(pamh, LOG_ERR, "child iteration failed: %d", ret); return (-1); } } /* Check if dataset type is filesystem */ if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) { pam_syslog(pamh, LOG_DEBUG, "dataset is not filesystem: %s, skipping", zfs_get_name(zhp)); return (0); } /* Don't umount the dataset if already unmounted */ if (zfs_is_mounted(zhp, NULL) == 0) { pam_syslog(pamh, LOG_INFO, "already unmounted: %s", zfs_get_name(zhp)); return (0); } /* Unmount the dataset */ ret = zfs_unmount(zhp, NULL, target->force_unmount ? MS_FORCE : 0); if (ret) { pam_syslog(pamh, LOG_ERR, "zfs_unmount failed for %s with: %d", zfs_get_name(zhp), ret); return (ret); } return (ret); } static int decrypt_mount(pam_handle_t *pamh, zfs_key_config_t *config, const char *ds_name, const char *passphrase, boolean_t noop) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, NULL); if (key == NULL) { zfs_close(ds); return (-1); } int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); if (ret && ret != EEXIST) { pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); zfs_close(ds); return (-1); } if (noop) { zfs_close(ds); return (0); } mount_umount_dataset_data_t data; data.pamh = pamh; data.target = config; ret = mount_dataset(ds, &data); if (ret != 0) { pam_syslog(pamh, LOG_ERR, "mount failed: %d", ret); zfs_close(ds); return (-1); } zfs_close(ds); return (0); } static int unmount_unload(pam_handle_t *pamh, const char *ds_name, zfs_key_config_t *target) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } mount_umount_dataset_data_t data; data.pamh = pamh; data.target = target; int ret = umount_dataset(ds, &data); if (ret) { pam_syslog(pamh, LOG_ERR, "unmount_dataset failed with: %d", ret); zfs_close(ds); return (-1); } ret = lzc_unload_key(ds_name); if (ret) { pam_syslog(pamh, LOG_ERR, "unload_key failed with: %d", ret); zfs_close(ds); return (-1); } zfs_close(ds); return (0); } static void zfs_key_config_free(zfs_key_config_t *config) { free(config->homes_prefix); free(config->runstatedir); free(config->homedir); free(config->dsname); } static int find_dsname_by_prop_value(zfs_handle_t *zhp, void *data) { zfs_type_t type = zfs_get_type(zhp); zfs_key_config_t *target = data; char mountpoint[ZFS_MAXPROPLEN]; /* Skip any datasets whose type does not match */ if ((type & ZFS_TYPE_FILESYSTEM) == 0) { zfs_close(zhp); return (0); } /* Skip any datasets whose mountpoint does not match */ (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE); if (strcmp(target->homedir, mountpoint) != 0) { if (target->recursive_homes) { (void) 
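decrypt_mount() above separates passphrase verification from actually loading the key: with noop set (as pam_sm_authenticate uses it), lzc_load_key() only checks that the derived wrapping key unwraps the dataset key, and EEXIST ("key already loaded") is tolerated. A minimal sketch of that probe, assuming libzfs_core.h and the same WRAPPING_KEY_LEN-sized key buffer:

    #include <errno.h>
    #include <stdint.h>
    #include <libzfs_core.h>

    /* Return 0 if key (keylen bytes) unlocks ds_name, without loading the key. */
    static int
    probe_wrapping_key(const char *ds_name, uint8_t *key, unsigned int keylen)
    {
            int err = lzc_load_key(ds_name, B_TRUE /* noop */, key, keylen);
            if (err == EEXIST)      /* key already loaded; treated as success */
                    err = 0;
            return (err);
    }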
zfs_iter_filesystems_v2(zhp, 0, find_dsname_by_prop_value, target); } zfs_close(zhp); return (target->dsname != NULL); } target->dsname = strdup(zfs_get_name(zhp)); zfs_close(zhp); return (1); } static char * zfs_key_config_get_dataset(pam_handle_t *pamh, zfs_key_config_t *config) { if (config->homedir != NULL && config->homes_prefix != NULL) { if (strcmp(config->homes_prefix, "*") == 0) { (void) zfs_iter_root(g_zfs, find_dsname_by_prop_value, config); } else { zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, ZFS_TYPE_FILESYSTEM); if (zhp == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", config->homes_prefix); return (NULL); } (void) zfs_iter_filesystems_v2(zhp, 0, find_dsname_by_prop_value, config); zfs_close(zhp); } char *dsname = config->dsname; config->dsname = NULL; return (dsname); } if (config->homes_prefix == NULL) { return (NULL); } size_t len = ZFS_MAX_DATASET_NAME_LEN; size_t total_len = strlen(config->homes_prefix) + 1 + strlen(config->username); if (total_len > len) { return (NULL); } char *ret = malloc(len + 1); if (!ret) { return (NULL); } ret[0] = 0; (void) snprintf(ret, len + 1, "%s/%s", config->homes_prefix, config->username); return (ret); } static int zfs_key_config_modify_session_counter(pam_handle_t *pamh, zfs_key_config_t *config, int delta) { const char *runtime_path = config->runstatedir; if (mkdir(runtime_path, S_IRWXU) != 0 && errno != EEXIST) { pam_syslog(pamh, LOG_ERR, "Can't create runtime path: %d", errno); return (-1); } if (chown(runtime_path, 0, 0) != 0) { pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d", errno); return (-1); } if (chmod(runtime_path, S_IRWXU) != 0) { pam_syslog(pamh, LOG_ERR, "Can't chmod runtime path: %d", errno); return (-1); } char *counter_path; if (asprintf(&counter_path, "%s/%u", runtime_path, config->uid) == -1) return (-1); const int fd = open(counter_path, O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW, S_IRUSR | S_IWUSR); free(counter_path); if (fd < 0) { pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", errno); return (-1); } if (flock(fd, LOCK_EX) != 0) { pam_syslog(pamh, LOG_ERR, "Can't lock counter file: %d", errno); close(fd); return (-1); } char counter[20]; char *pos = counter; int remaining = sizeof (counter) - 1; int ret; counter[sizeof (counter) - 1] = 0; while (remaining > 0 && (ret = read(fd, pos, remaining)) > 0) { remaining -= ret; pos += ret; } *pos = 0; long int counter_value = strtol(counter, NULL, 10); counter_value += delta; if (counter_value < 0) { counter_value = 0; } lseek(fd, 0, SEEK_SET); if (ftruncate(fd, 0) != 0) { pam_syslog(pamh, LOG_ERR, "Can't truncate counter file: %d", errno); close(fd); return (-1); } snprintf(counter, sizeof (counter), "%ld", counter_value); remaining = strlen(counter); pos = counter; while (remaining > 0 && (ret = write(fd, pos, remaining)) > 0) { remaining -= ret; pos += ret; } close(fd); return (counter_value); } __attribute__((visibility("default"))) PAM_EXTERN int pam_sm_authenticate(pam_handle_t *pamh, int flags, int argc, const char **argv) { (void) flags; if (geteuid() != 0) { pam_syslog(pamh, LOG_ERR, "Cannot zfs_mount when not being root."); return (PAM_SERVICE_ERR); } zfs_key_config_t config; int config_err = zfs_key_config_load(pamh, &config, argc, argv); if (config_err != PAM_SUCCESS) { return (config_err); } if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } const pw_password_t *token = pw_fetch_lazy(pamh, PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { 
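zfs_key_config_modify_session_counter() above keeps a per-UID counter file under runstatedir so that only the first pam_sm_open_session() decrypts and mounts the home dataset and only the last pam_sm_close_session() unmounts it. A condensed sketch of the same flock(2)-protected read-modify-write (error handling trimmed, path handling simplified):

    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/file.h>
    #include <unistd.h>

    /* Add delta to the counter stored in path, clamped at zero; -1 on error. */
    static long
    bump_counter(const char *path, long delta)
    {
            char buf[20] = "";
            long val = 0;
            int fd = open(path, O_RDWR | O_CREAT | O_CLOEXEC | O_NOFOLLOW, 0600);
            if (fd < 0 || flock(fd, LOCK_EX) != 0) {
                    if (fd >= 0)
                            close(fd);
                    return (-1);
            }
            if (read(fd, buf, sizeof (buf) - 1) > 0)
                    val = strtol(buf, NULL, 10);
            val += delta;
            if (val < 0)
                    val = 0;
            (void) lseek(fd, 0, SEEK_SET);
            (void) ftruncate(fd, 0);
            dprintf(fd, "%ld", val);
            close(fd);
            return (val);
    }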
zfs_key_config_free(&config); return (PAM_AUTH_ERR); } if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } if (decrypt_mount(pamh, &config, dataset, token->value, B_TRUE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_AUTH_ERR); } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SUCCESS); } __attribute__((visibility("default"))) PAM_EXTERN int pam_sm_setcred(pam_handle_t *pamh, int flags, int argc, const char **argv) { (void) pamh, (void) flags, (void) argc, (void) argv; return (PAM_SUCCESS); } __attribute__((visibility("default"))) PAM_EXTERN int pam_sm_chauthtok(pam_handle_t *pamh, int flags, int argc, const char **argv) { if (geteuid() != 0) { pam_syslog(pamh, LOG_ERR, "Cannot zfs_mount when not being root."); return (PAM_PERM_DENIED); } zfs_key_config_t config; if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SERVICE_ERR); } if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } const pw_password_t *old_token = pw_get(pamh, PAM_OLDAUTHTOK, OLD_PASSWORD_VAR_NAME); { if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } if (!old_token) { pam_syslog(pamh, LOG_ERR, "old password from PAM stack is null"); free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } if (decrypt_mount(pamh, &config, dataset, old_token->value, B_TRUE) == -1) { pam_syslog(pamh, LOG_ERR, "old token mismatch"); free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_PERM_DENIED); } } if ((flags & PAM_UPDATE_AUTHTOK) != 0) { const pw_password_t *token = pw_get(pamh, PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { pam_syslog(pamh, LOG_ERR, "new password unavailable"); pam_zfs_free(); zfs_key_config_free(&config); pw_clear(pamh, OLD_PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); pw_clear(pamh, OLD_PASSWORD_VAR_NAME); pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } int was_loaded = is_key_loaded(pamh, dataset); if (!was_loaded && decrypt_mount(pamh, &config, dataset, old_token->value, B_FALSE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); pw_clear(pamh, OLD_PASSWORD_VAR_NAME); pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } int changed = change_key(pamh, dataset, token->value); if (!was_loaded) { unmount_unload(pamh, dataset, &config); } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); if (pw_clear(pamh, OLD_PASSWORD_VAR_NAME) == -1 || pw_clear(pamh, PASSWORD_VAR_NAME) == -1 || changed == -1) { return (PAM_SERVICE_ERR); } } else { zfs_key_config_free(&config); } return (PAM_SUCCESS); } PAM_EXTERN int pam_sm_open_session(pam_handle_t *pamh, int flags, int argc, const char **argv) { (void) flags; if (geteuid() != 0) { pam_syslog(pamh, LOG_ERR, "Cannot zfs_mount when not being root."); return (PAM_SUCCESS); } zfs_key_config_t config; if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SESSION_ERR); } if (config.uid < config.uid_min || 
config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SUCCESS); } int counter = zfs_key_config_modify_session_counter(pamh, &config, 1); if (counter != 1) { zfs_key_config_free(&config); return (PAM_SUCCESS); } const pw_password_t *token = pw_get(pamh, PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { zfs_key_config_free(&config); return (PAM_SESSION_ERR); } if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } if (decrypt_mount(pamh, &config, dataset, token->value, B_FALSE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); if (pw_clear(pamh, PASSWORD_VAR_NAME) == -1) { return (PAM_SERVICE_ERR); } return (PAM_SUCCESS); } __attribute__((visibility("default"))) PAM_EXTERN int pam_sm_close_session(pam_handle_t *pamh, int flags, int argc, const char **argv) { (void) flags; if (geteuid() != 0) { pam_syslog(pamh, LOG_ERR, "Cannot zfs_mount when not being root."); return (PAM_SUCCESS); } zfs_key_config_t config; if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SESSION_ERR); } if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SUCCESS); } int counter = zfs_key_config_modify_session_counter(pamh, &config, -1); if (counter != 0) { zfs_key_config_free(&config); return (PAM_SUCCESS); } if (config.unmount_and_unload) { if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(pamh, &config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SESSION_ERR); } if (unmount_unload(pamh, dataset, &config) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SESSION_ERR); } free(dataset); pam_zfs_free(); } zfs_key_config_free(&config); return (PAM_SUCCESS); } diff --git a/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h b/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h index 16e8a319a5f8..152e5a606f0e 100644 --- a/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h +++ b/sys/contrib/openzfs/include/os/linux/kernel/linux/dcache_compat.h @@ -1,106 +1,80 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. 
*/ #ifndef _ZFS_DCACHE_H #define _ZFS_DCACHE_H #include #define dname(dentry) ((char *)((dentry)->d_name.name)) #define dlen(dentry) ((int)((dentry)->d_name.len)) #define d_alias d_u.d_alias /* * Starting from Linux 5.13, flush_dcache_page() becomes an inline function * and under some configurations, may indirectly referencing GPL-only * symbols, e.g., cpu_feature_keys on powerpc and PageHuge on riscv. * Override this function when it is detected being GPL-only. */ #if defined __powerpc__ && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY #include #define flush_dcache_page(page) do { \ if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE) && \ test_bit(PG_dcache_clean, &(page)->flags)) \ clear_bit(PG_dcache_clean, &(page)->flags); \ } while (0) #endif /* * For riscv implementation, the use of PageHuge can be safely removed. * Because it handles pages allocated by HugeTLB, while flush_dcache_page * in zfs module is only called on kernel pages. */ #if defined __riscv && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY #define flush_dcache_page(page) do { \ if (test_bit(PG_dcache_clean, &(page)->flags)) \ clear_bit(PG_dcache_clean, &(page)->flags); \ } while (0) #endif -/* - * 2.6.30 API change, - * The const keyword was added to the 'struct dentry_operations' in - * the dentry structure. To handle this we define an appropriate - * dentry_operations_t typedef which can be used. - */ -typedef const struct dentry_operations dentry_operations_t; - -/* - * 2.6.38 API addition, - * Added d_clear_d_op() helper function which clears some flags and the - * registered dentry->d_op table. This is required because d_set_d_op() - * issues a warning when the dentry operations table is already set. - * For the .zfs control directory to work properly we must be able to - * override the default operations table and register custom .d_automount - * and .d_revalidate callbacks. - */ -static inline void -d_clear_d_op(struct dentry *dentry) -{ - dentry->d_op = NULL; - dentry->d_flags &= ~( - DCACHE_OP_HASH | DCACHE_OP_COMPARE | - DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); -} - /* * Walk and invalidate all dentry aliases of an inode * unless it's a mountpoint */ static inline void zpl_d_drop_aliases(struct inode *inode) { struct dentry *dentry; spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { if (!IS_ROOT(dentry) && !d_mountpoint(dentry) && (dentry->d_inode == inode)) { d_drop(dentry); } } spin_unlock(&inode->i_lock); } #endif /* _ZFS_DCACHE_H */ diff --git a/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h b/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h index 087389b57b34..ad2815e46394 100644 --- a/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h +++ b/sys/contrib/openzfs/include/os/linux/spl/sys/stat.h @@ -1,30 +1,30 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. * Copyright (C) 2007 The Regents of the University of California. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * UCRL-CODE-235197 * * This file is part of the SPL, Solaris Porting Layer. * * The SPL is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. 
* * The SPL is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . */ #ifndef _SPL_STAT_H #define _SPL_STAT_H -#include +#include #endif /* SPL_STAT_H */ diff --git a/sys/contrib/openzfs/include/sys/zio.h b/sys/contrib/openzfs/include/sys/zio.h index 353805fcb969..a8acb83b4c2f 100644 --- a/sys/contrib/openzfs/include/sys/zio.h +++ b/sys/contrib/openzfs/include/sys/zio.h @@ -1,757 +1,758 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Toomas Soome * Copyright (c) 2019, Allan Jude * Copyright (c) 2019, 2023, 2024, Klara Inc. * Copyright (c) 2019-2020, Michael Niewöhner * Copyright (c) 2024 by George Melikov. All rights reserved. */ #ifndef _ZIO_H #define _ZIO_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Embedded checksum */ #define ZEC_MAGIC 0x210da7ab10c7a11ULL typedef struct zio_eck { uint64_t zec_magic; /* for validation, endianness */ zio_cksum_t zec_cksum; /* 256-bit checksum */ } zio_eck_t; /* * Gang block headers are self-checksumming and contain an array * of block pointers. The old gang block size has enough room for 3 blkptrs, * while new gang blocks can store more. * * Layout: * +--------+--------+--------+-----+---------+-----------+ * | | | | | | | * | blkptr | blkptr | blkptr | ... 
| padding | zio_eck_t | * | 1 | 2 | 3 | | | | * +--------+--------+--------+-----+---------+-----------+ * 128B 128B 128B 88B 40B */ #define SPA_OLD_GANGBLOCKSIZE SPA_MINBLOCKSIZE typedef void zio_gbh_phys_t; static inline uint64_t gbh_nblkptrs(uint64_t size) { ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); return ((size - sizeof (zio_eck_t)) / sizeof (blkptr_t)); } static inline zio_eck_t * gbh_eck(zio_gbh_phys_t *gbh, uint64_t size) { ASSERT(IS_P2ALIGNED(size, sizeof (blkptr_t))); - return ((zio_eck_t *)((uintptr_t)gbh + (size_t)size - sizeof (zio_eck_t))); + return ((zio_eck_t *)((uintptr_t)gbh + (size_t)size - + sizeof (zio_eck_t))); } static inline blkptr_t * gbh_bp(zio_gbh_phys_t *gbh, int bp) { return (&((blkptr_t *)gbh)[bp]); } enum zio_checksum { ZIO_CHECKSUM_INHERIT = 0, ZIO_CHECKSUM_ON, ZIO_CHECKSUM_OFF, ZIO_CHECKSUM_LABEL, ZIO_CHECKSUM_GANG_HEADER, ZIO_CHECKSUM_ZILOG, ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_NOPARITY, ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, ZIO_CHECKSUM_BLAKE3, ZIO_CHECKSUM_FUNCTIONS }; /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 #define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON #define ZIO_CHECKSUM_MASK 0xffULL #define ZIO_CHECKSUM_VERIFY (1U << 8) #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 /* macros defining encryption lengths */ #define ZIO_OBJSET_MAC_LEN 32 #define ZIO_DATA_IV_LEN 12 #define ZIO_DATA_SALT_LEN 8 #define ZIO_DATA_MAC_LEN 16 /* * The number of "legacy" compression functions which can be set on individual * objects. */ #define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 /* * The meaning of "compress = on" selected by the compression features enabled * on a given pool. */ #define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_ON #define BOOTFS_COMPRESS_VALID(compress) \ ((compress) == ZIO_COMPRESS_LZJB || \ (compress) == ZIO_COMPRESS_LZ4 || \ (compress) == ZIO_COMPRESS_GZIP_1 || \ (compress) == ZIO_COMPRESS_GZIP_2 || \ (compress) == ZIO_COMPRESS_GZIP_3 || \ (compress) == ZIO_COMPRESS_GZIP_4 || \ (compress) == ZIO_COMPRESS_GZIP_5 || \ (compress) == ZIO_COMPRESS_GZIP_6 || \ (compress) == ZIO_COMPRESS_GZIP_7 || \ (compress) == ZIO_COMPRESS_GZIP_8 || \ (compress) == ZIO_COMPRESS_GZIP_9 || \ (compress) == ZIO_COMPRESS_ZLE || \ (compress) == ZIO_COMPRESS_ZSTD || \ (compress) == ZIO_COMPRESS_ON || \ (compress) == ZIO_COMPRESS_OFF) #define ZIO_COMPRESS_ALGO(x) (x & SPA_COMPRESSMASK) #define ZIO_COMPRESS_LEVEL(x) ((x & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS) #define ZIO_COMPRESS_RAW(type, level) (type | ((level) << SPA_COMPRESSBITS)) #define ZIO_COMPLEVEL_ZSTD(level) \ ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level) #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 typedef enum zio_suspend_reason { ZIO_SUSPEND_NONE = 0, ZIO_SUSPEND_IOERR, ZIO_SUSPEND_MMP, } zio_suspend_reason_t; /* * This was originally an enum type. However, those are 32-bit and there is no * way to make a 64-bit enum type. Since we ran out of bits for flags, we were * forced to upgrade it to a uint64_t. * * NOTE: PLEASE UPDATE THE BITFIELD STRINGS IN zfs_valstr.c IF YOU ADD ANOTHER * FLAG. 
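A worked example for the old gang header layout drawn earlier in this header, using the sizes from the diagram (blkptr_t is 128 bytes, zio_eck_t is 40 bytes, SPA_OLD_GANGBLOCKSIZE = SPA_MINBLOCKSIZE = 512 bytes):

    gbh_nblkptrs(512) = (512 - 40) / 128 = 3 block pointers
    padding           = 512 - 3 * 128 - 40 = 88 bytes

which matches the 128B + 128B + 128B + 88B + 40B row in the diagram.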
*/ typedef uint64_t zio_flag_t; /* * Flags inherited by gang, ddt, and vdev children, * and that must be equal for two zios to aggregate */ #define ZIO_FLAG_DONT_AGGREGATE (1ULL << 0) #define ZIO_FLAG_IO_REPAIR (1ULL << 1) #define ZIO_FLAG_SELF_HEAL (1ULL << 2) #define ZIO_FLAG_RESILVER (1ULL << 3) #define ZIO_FLAG_SCRUB (1ULL << 4) #define ZIO_FLAG_SCAN_THREAD (1ULL << 5) #define ZIO_FLAG_PHYSICAL (1ULL << 6) #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ #define ZIO_FLAG_CANFAIL (1ULL << 7) /* must be first for INHERIT */ #define ZIO_FLAG_SPECULATIVE (1ULL << 8) #define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) #define ZIO_FLAG_DONT_RETRY (1ULL << 10) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_ALLOC_THROTTLED (1ULL << 14) #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) /* * Flags inherited by vdev children. */ #define ZIO_FLAG_IO_RETRY (1ULL << 15) /* must be first for INHERIT */ #define ZIO_FLAG_PROBE (1ULL << 16) #define ZIO_FLAG_TRYHARD (1ULL << 17) #define ZIO_FLAG_OPTIONAL (1ULL << 18) #define ZIO_FLAG_DIO_READ (1ULL << 19) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ #define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */ #define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21) #define ZIO_FLAG_IO_BYPASS (1ULL << 22) #define ZIO_FLAG_IO_REWRITE (1ULL << 23) #define ZIO_FLAG_RAW_COMPRESS (1ULL << 24) #define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25) #define ZIO_FLAG_GANG_CHILD (1ULL << 26) #define ZIO_FLAG_DDT_CHILD (1ULL << 27) #define ZIO_FLAG_GODFATHER (1ULL << 28) #define ZIO_FLAG_NOPWRITE (1ULL << 29) #define ZIO_FLAG_REEXECUTED (1ULL << 30) #define ZIO_FLAG_DELEGATED (1ULL << 31) #define ZIO_FLAG_PREALLOCATED (1ULL << 32) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) #define ZIO_FLAG_MUSTSUCCEED 0 #define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) #define ZIO_DDT_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) #define ZIO_VDEV_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL) #define ZIO_CHILD_BIT(x) (1U << (x)) #define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1U << (x))) enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; #define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV) #define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG) #define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT) #define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL) #define ZIO_CHILD_ALL_BITS \ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT) enum zio_wait_type { ZIO_WAIT_READY = 0, ZIO_WAIT_DONE, ZIO_WAIT_TYPES }; typedef void zio_done_func_t(zio_t *zio); extern int zio_exclude_metadata; extern int zio_dva_throttle_enabled; extern const char *const zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) * is objset 0, and the meta-dnode is object 0. 
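The *_INHERIT masks defined above work by arithmetic on the flag values: because ZIO_FLAG_CANFAIL is the first flag of its group, subtracting one yields a mask of every flag below it. For example, ZIO_FLAG_AGG_INHERIT = (1ULL << 7) - 1 = 0x7f, which is exactly the seven aggregation-inheritable flags numbered 0 through 6 (ZIO_FLAG_DONT_AGGREGATE through ZIO_FLAG_PHYSICAL); a gang, DDT or vdev child therefore starts from (pio->io_flags & ZIO_FLAG_*_INHERIT) and copies only that low, inheritable group from its parent.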
This covers all blocks * except root blocks and ZIL blocks, which are defined as follows: * * Root blocks (objset_phys_t) are object 0, level -1: . * ZIL blocks are bookmarked . * dmu_sync()ed ZIL data blocks are bookmarked . * dnode visit bookmarks are . * * Note: this structure is called a bookmark because its original purpose * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel, and is * stored on disk (by virtue of being incorporated into other on-disk * structures, e.g. dsl_scan_phys_t). * * If the head_errlog feature is enabled a different on-disk format for error * logs is used. This introduces the use of an error bookmark, a four-tuple * that uniquely identifies any error block * in the pool. The birth transaction group is used to track whether the block * has been overwritten by newer data or added to a snapshot since its marking * as an error. */ struct zbookmark_phys { uint64_t zb_objset; uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; }; struct zbookmark_err_phys { uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; uint64_t zb_birth; }; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ (zb)->zb_objset = objset; \ (zb)->zb_object = object; \ (zb)->zb_level = level; \ (zb)->zb_blkid = blkid; \ } #define ZB_DESTROYED_OBJSET (-1ULL) #define ZB_ROOT_OBJECT (0ULL) #define ZB_ROOT_LEVEL (-1LL) #define ZB_ROOT_BLKID (0ULL) #define ZB_ZIL_OBJECT (0ULL) #define ZB_ZIL_LEVEL (-2LL) #define ZB_DNODE_LEVEL (-3LL) #define ZB_DNODE_BLKID (0ULL) #define ZB_IS_ZERO(zb) \ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \ (zb)->zb_level == 0 && (zb)->zb_blkid == 0) #define ZB_IS_ROOT(zb) \ ((zb)->zb_object == ZB_ROOT_OBJECT && \ (zb)->zb_level == ZB_ROOT_LEVEL && \ (zb)->zb_blkid == ZB_ROOT_BLKID) typedef struct zio_prop { enum zio_checksum zp_checksum:8; enum zio_compress zp_compress:8; uint8_t zp_complevel; uint8_t zp_level; uint8_t zp_copies; uint8_t zp_gang_copies; dmu_object_type_t zp_type:8; dmu_object_type_t zp_storage_type:8; boolean_t zp_dedup:1; boolean_t zp_dedup_verify:1; boolean_t zp_nopwrite:1; boolean_t zp_brtwrite:1; boolean_t zp_encrypt:1; boolean_t zp_byteorder:1; boolean_t zp_direct_write:1; boolean_t zp_rewrite:1; uint32_t zp_zpl_smallblk; uint8_t zp_salt[ZIO_DATA_SALT_LEN]; uint8_t zp_iv[ZIO_DATA_IV_LEN]; uint8_t zp_mac[ZIO_DATA_MAC_LEN]; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, const abd_t *good_data); typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ struct dnode_phys; struct abd; struct zio_cksum_report { struct zio_cksum_report *zcr_next; nvlist_t *zcr_ereport; nvlist_t *zcr_detector; void *zcr_cbdata; size_t zcr_cbinfo; /* passed to zcr_free() */ uint64_t zcr_sector; uint64_t zcr_align; uint64_t zcr_length; zio_cksum_finish_f *zcr_finish; zio_cksum_free_f *zcr_free; /* internal use only */ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ }; typedef struct zio_vsd_ops { zio_done_func_t *vsd_free; } zio_vsd_ops_t; typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; uint64_t gn_gangblocksize; uint64_t gn_allocsize; struct zio_gang_node *gn_child[]; } zio_gang_node_t; typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp, zio_gang_node_t *gn, struct abd *data, uint64_t offset); typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size); typedef struct zio_transform { struct abd *zt_orig_abd; 
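As a concrete use of SET_BOOKMARK and the conventions above, a ZIL block is identified by object ZB_ZIL_OBJECT (0) at level ZB_ZIL_LEVEL (-2) with the ZIL sequence number as the block id; os and zil_seq below are placeholders for the objset and sequence number at hand:

    zbookmark_phys_t zb;
    SET_BOOKMARK(&zb, dmu_objset_id(os), ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, zil_seq);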
uint64_t zt_orig_size; uint64_t zt_bufsize; zio_transform_func_t *zt_transform; struct zio_transform *zt_next; } zio_transform_t; typedef zio_t *zio_pipe_stage_t(zio_t *zio); /* * The io_post flags describe additional actions that a parent IO should * consider or perform on behalf of a child. They are distinct from io_flags * because the child must be able to propagate them to the parent. The normal * io_flags are local to the zio, not protected by any lock, and not modifiable * by children; the reexecute flags are protected by io_lock, modifiable by * children, and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set. */ #define ZIO_POST_REEXECUTE (1 << 0) #define ZIO_POST_SUSPEND (1 << 1) #define ZIO_POST_DIO_CHKSUM_ERR (1 << 2) /* * The io_trim flags are used to specify the type of TRIM to perform. They * only apply to ZIO_TYPE_TRIM zios are distinct from io_flags. */ enum trim_flag { ZIO_TRIM_SECURE = 1U << 0, }; typedef struct zio_alloc_list { list_t zal_list; uint64_t zal_size; } zio_alloc_list_t; typedef struct zio_link { zio_t *zl_parent; zio_t *zl_child; list_node_t zl_parent_node; list_node_t zl_child_node; } zio_link_t; enum zio_qstate { ZIO_QS_NONE = 0, ZIO_QS_QUEUED, ZIO_QS_ACTIVE, }; struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; zio_prop_t io_prop; zio_type_t io_type; enum zio_child io_child_type; enum trim_flag io_trim_flags; zio_priority_t io_priority; uint8_t io_post; uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; blkptr_t *io_bp_override; blkptr_t io_bp_copy; list_t io_parent_list; list_t io_child_list; zio_t *io_logical; zio_transform_t *io_transform_stack; /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_children_ready; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* io_lsize != io_orig_size iff this is a raw write */ uint64_t io_lsize; /* Data represented by this I/O */ struct abd *io_abd; struct abd *io_orig_abd; uint64_t io_size; uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ enum zio_qstate io_queue_state; /* vdev queue state */ union { list_node_t l; avl_node_t a; } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; hrtime_t io_target_timestamp; hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). 
*/ zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ zio_flag_t io_flags; enum zio_stage io_stage; enum zio_stage io_pipeline; zio_flag_t io_orig_flags; enum zio_stage io_orig_stage; enum zio_stage io_orig_pipeline; enum zio_stage io_pipeline_trace; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; void *io_bio; kmutex_t io_lock; kcondvar_t io_cv; int io_allocator; /* FMA state */ zio_cksum_report_t *io_cksum_report; uint64_t io_ena; /* Taskq dispatching state */ taskq_ent_t io_tqent; }; enum blk_verify_flag { BLK_VERIFY_ONLY, BLK_VERIFY_LOG, BLK_VERIFY_HALT }; enum blk_config_flag { BLK_CONFIG_HELD, // SCL_VDEV held for writer BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader BLK_CONFIG_NEEDED_TRY, // Try with SCL_VDEV for reader BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV }; extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_root(spa_t *spa, zio_done_func_t *done, void *priv, zio_flag_t flags); extern void zio_destroy(zio_t *zio); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb); extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, int gang_copies, boolean_t nopwrite, boolean_t brtwrite); extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_done_func_t *done, void *priv, zio_flag_t flags); extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, enum trim_flag trim_flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, struct abd *data, int checksum, zio_done_func_t *done, void *priv, zio_priority_t priority, zio_flag_t flags, boolean_t labels); extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio_flag_t flags); extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, uint64_t min_size, uint64_t max_size, boolean_t *slog, boolean_t allow_larger); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); extern size_t zio_get_compression_max_size(enum zio_compress compress, uint64_t gcd_alloc, uint64_t min_alloc, size_t s_len); extern int zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(void 
*zio); extern void zio_interrupt(void *zio); extern void zio_delay_init(zio_t *zio); extern void zio_delay_interrupt(zio_t *zio); extern void zio_deadman(zio_t *zio, const char *tag); extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); extern void zio_data_buf_free(void *buf, size_t size); extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, uint64_t bufsize, zio_transform_func_t *transform); extern void zio_pop_transforms(zio_t *zio); extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, int type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *priv); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, zio_flag_t flags, zio_done_func_t *done, void *priv); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); extern void zio_change_priority(zio_t *pio, zio_priority_t priority); extern void zio_checksum_verified(zio_t *zio); extern void zio_dio_chksum_verify_error_report(zio_t *zio); extern int zio_worst_error(int e1, int e2); extern enum zio_checksum zio_checksum_select(enum zio_checksum child, enum zio_checksum parent); extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, enum zio_checksum parent); extern enum zio_compress zio_compress_select(spa_t *spa, enum zio_compress child, enum zio_compress parent); extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child, uint8_t parent); extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); extern int zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* * Initial setup and teardown. 
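The interfaces above are typically used by building a tree under a root zio: children issued with zio_nowait() complete asynchronously, and a single zio_wait() on the root collects the propagated error. A hedged sketch of that pattern (bp, abd, size and zb are placeholders; ZIO_PRIORITY_ASYNC_READ is assumed from zio_priority.h):

    zio_t *rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
    zio_nowait(zio_read(rio, spa, bp, abd, size, NULL, NULL,
        ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, zb));
    int err = zio_wait(rio);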
*/ extern void zio_init(void); extern void zio_fini(void); /* * Fault injection */ struct zinject_record; extern uint32_t zio_injection_enabled; extern int zio_inject_fault(char *name, int flags, int *id, struct zinject_record *record); extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern void zio_handle_panic_injection(spa_t *spa, const char *tag, uint64_t type); extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, uint64_t type, int error); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2); extern int zio_handle_label_injection(zio_t *zio, int error); extern void zio_handle_ignored_writes(zio_t *zio); extern hrtime_t zio_handle_io_delay(zio_t *zio); extern void zio_handle_import_delay(spa_t *spa, hrtime_t elapsed); extern void zio_handle_export_delay(spa_t *spa, hrtime_t elapsed); /* * Checksum ereport functions */ extern int zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, const abd_t *good_data, const abd_t *bad_data, struct zio_bad_cksum *info); void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr); extern void zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name); /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); /* zbookmark_phys functions */ boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); boolean_t zbookmark_subtree_tbd(const struct dnode_phys *dnp, const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); #ifdef __cplusplus } #endif #endif /* _ZIO_H */ diff --git a/sys/contrib/openzfs/include/sys/zvol.h b/sys/contrib/openzfs/include/sys/zvol.h index cdc9dba2a28d..5791246e99e4 100644 --- a/sys/contrib/openzfs/include/sys/zvol.h +++ b/sys/contrib/openzfs/include/sys/zvol.h @@ -1,65 +1,65 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. */ #ifndef _SYS_ZVOL_H #define _SYS_ZVOL_H #include #define ZVOL_OBJ 1ULL #define ZVOL_ZAP_OBJ 2ULL #define SPEC_MAXOFFSET_T ((1LL << ((NBBY * sizeof (daddr32_t)) + \ DEV_BSHIFT - 1)) - 1) extern void zvol_create_minors(const char *); extern void zvol_remove_minors(spa_t *, const char *, boolean_t); extern void zvol_rename_minors(spa_t *, const char *, const char *, boolean_t); #ifdef _KERNEL struct zvol_state; typedef struct zvol_state zvol_state_handle_t; extern int zvol_check_volsize(uint64_t, uint64_t); extern int zvol_check_volblocksize(const char *, uint64_t); extern int zvol_get_stats(objset_t *, nvlist_t *); extern boolean_t zvol_is_zvol(const char *); extern void zvol_create_cb(objset_t *, void *, cred_t *, dmu_tx_t *); extern int zvol_set_volsize(const char *, uint64_t); extern int zvol_set_volthreading(const char *, boolean_t); extern int zvol_set_common(const char *, zfs_prop_t, zprop_source_t, uint64_t); extern int zvol_set_ro(const char *, boolean_t); -extern zvol_state_handle_t *zvol_suspend(const char *); +extern int zvol_suspend(const char *, zvol_state_handle_t **); extern int zvol_resume(zvol_state_handle_t *); extern void *zvol_tag(zvol_state_handle_t *); extern int zvol_init(void); extern void zvol_fini(void); extern int zvol_busy(void); #endif /* _KERNEL */ #endif /* _SYS_ZVOL_H */ diff --git a/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/stat.h b/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/stat.h index a605af962a6d..13cc0b46ac93 100644 --- a/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/stat.h +++ b/sys/contrib/openzfs/lib/libspl/include/os/linux/sys/stat.h @@ -1,56 +1,56 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _LIBSPL_SYS_STAT_H #define _LIBSPL_SYS_STAT_H #include_next #include /* for BLKGETSIZE64 */ #ifdef HAVE_STATX #include -#include +#include #endif /* * Emulate Solaris' behavior of returning the block device size in fstat64(). 
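The zvol.h hunk above changes zvol_suspend() from returning the handle directly to returning an error code and passing the handle back through an out parameter. A sketch of the calling pattern implied by the new prototype:

    zvol_state_handle_t *zv = NULL;
    int err = zvol_suspend(name, &zv);
    if (err == 0) {
            /* ... operate while the zvol is suspended ... */
            err = zvol_resume(zv);
    }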
*/ static inline int fstat64_blk(int fd, struct stat64 *st) { if (fstat64(fd, st) == -1) return (-1); /* In Linux we need to use an ioctl to get the size of a block device */ if (S_ISBLK(st->st_mode)) { if (ioctl(fd, BLKGETSIZE64, &st->st_size) != 0) return (-1); } return (0); } #endif /* _LIBSPL_SYS_STAT_H */ diff --git a/sys/contrib/openzfs/man/man1/arcstat.1 b/sys/contrib/openzfs/man/man1/arcstat.1 index f2474fbb701f..288b98d57a11 100644 --- a/sys/contrib/openzfs/man/man1/arcstat.1 +++ b/sys/contrib/openzfs/man/man1/arcstat.1 @@ -1,427 +1,429 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" This file and its contents are supplied under the terms of the .\" Common Development and Distribution License ("CDDL"), version 1.0. .\" You may only use this file in accordance with the terms of version .\" 1.0 of the CDDL. .\" .\" A full copy of the text of the CDDL should have accompanied this .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" .\" Copyright 2014 Adam Stevko. All rights reserved. .\" Copyright (c) 2015 by Delphix. All rights reserved. .\" Copyright (c) 2020 by AJ Jordan. All rights reserved. .\" -.Dd December 23, 2022 +.Dd September 19, 2024 .Dt ARCSTAT 1 .Os . .Sh NAME .Nm arcstat .Nd report ZFS ARC and L2ARC statistics +.Sh NOTICE +It will be renamed to zarcstat in zfs 2.4.0. Please migrate ASAP. .Sh SYNOPSIS .Nm .Op Fl havxp .Op Fl f Ar field Ns Op , Ns Ar field Ns … .Op Fl o Ar file .Op Fl s Ar string .Op Ar interval .Op Ar count . .Sh DESCRIPTION .Nm prints various ZFS ARC and L2ARC statistics in vmstat-like fashion: .Bl -tag -compact -offset Ds -width "l2asize" .It Sy c ARC target size .It Sy dh% Demand hit percentage .It Sy di% Demand I/O hit percentage .It Sy dm% Demand miss percentage .It Sy ddh% Demand data hit percentage .It Sy ddi% Demand data I/O hit percentage .It Sy ddm% Demand data miss percentage .It Sy dmh% Demand metadata hit percentage .It Sy dmi% Demand metadata I/O hit percentage .It Sy dmm% Demand metadata miss percentage .It Sy mfu MFU list hits per second .It Sy mh% Metadata hit percentage .It Sy mi% Metadata I/O hit percentage .It Sy mm% Metadata miss percentage .It Sy mru MRU list hits per second .It Sy ph% Prefetch hits percentage .It Sy pi% Prefetch I/O hits percentage .It Sy pm% Prefetch miss percentage .It Sy pdh% Prefetch data hits percentage .It Sy pdi% Prefetch data I/O hits percentage .It Sy pdm% Prefetch data miss percentage .It Sy pmh% Prefetch metadata hits percentage .It Sy pmi% Prefetch metadata I/O hits percentage .It Sy pmm% Prefetch metadata miss percentage .It Sy dhit Demand hits per second .It Sy dioh Demand I/O hits per second .It Sy dmis Demand misses per second .It Sy ddhit Demand data hits per second .It Sy ddioh Demand data I/O hits per second .It Sy ddmis Demand data misses per second .It Sy dmhit Demand metadata hits per second .It Sy dmioh Demand metadata I/O hits per second .It Sy dmmis Demand metadata misses per second .It Sy hit% ARC hit percentage .It Sy hits ARC hits per second .It Sy ioh% ARC I/O hits percentage .It Sy iohs ARC I/O hits per second .It Sy mfug MFU ghost list hits per second .It Sy mhit Metadata hits per second .It Sy mioh Metadata I/O hits per second .It Sy miss ARC misses per second .It Sy mmis Metadata misses per second .It Sy mrug MRU ghost list hits per second .It Sy phit Prefetch hits per second .It Sy pioh Prefetch I/O hits per second .It Sy pmis Prefetch misses per second .It Sy pdhit Prefetch data hits per second .It Sy pdioh Prefetch 
data I/O hits per second .It Sy pdmis Prefetch data misses per second .It Sy pmhit Prefetch metadata hits per second .It Sy pmioh Prefetch metadata I/O hits per second .It Sy pmmis Prefetch metadata misses per second .It Sy read Total ARC accesses per second .It Sy time Current time .It Sy size ARC size .It Sy arcsz Alias for .Sy size .It Sy unc Uncached list hits per second .It Sy dread Demand accesses per second .It Sy ddread Demand data accesses per second .It Sy dmread Demand metadata accesses per second .It Sy eskip evict_skip per second .It Sy miss% ARC miss percentage .It Sy mread Metadata accesses per second .It Sy pread Prefetch accesses per second .It Sy pdread Prefetch data accesses per second .It Sy pmread Prefetch metadata accesses per second .It Sy l2hit% L2ARC access hit percentage .It Sy l2hits L2ARC hits per second .It Sy l2miss L2ARC misses per second .It Sy l2read Total L2ARC accesses per second .It Sy l2pref L2ARC prefetch allocated size per second .It Sy l2pref% L2ARC prefetch allocated size percentage .It Sy l2mfu L2ARC MFU allocated size per second .It Sy l2mfu% L2ARC MFU allocated size percentage .It Sy l2mru L2ARC MRU allocated size per second .It Sy l2mru% L2ARC MRU allocated size percentage .It Sy l2data L2ARC data (buf content) allocated size per second .It Sy l2data% L2ARC data (buf content) allocated size percentage .It Sy l2meta L2ARC metadata (buf content) allocated size per second .It Sy l2meta% L2ARC metadata (buf content) allocated size percentage .It Sy l2size Size of the L2ARC .It Sy mtxmis mutex_miss per second .It Sy l2bytes Bytes read per second from the L2ARC .It Sy l2wbytes Bytes written per second to the L2ARC .It Sy l2miss% L2ARC access miss percentage .It Sy l2asize Actual (compressed) size of the L2ARC .It Sy cmpsz Compressed size .It Sy cmpsz% Compressed size percentage .It Sy ovhsz Overhead size .It Sy ovhsz% Overhead size percentage .It Sy bonsz Bonus size .It Sy bonsz% Bonus size percentage .It Sy dnosz Dnode size .It Sy dnosz% Dnode size percentage .It Sy dbusz Dbuf size .It Sy dbusz% Dbuf size percentage .It Sy hdrsz Header size .It Sy hdrsz% Header size percentage .It Sy l2hsz L2 header size .It Sy l2hsz% L2 header size percentage .It Sy abdsz ABD chunk waste size .It Sy abdsz% ABD chunk waste size percentage .It Sy datatg ARC data target .It Sy datatg% ARC data target percentage .It Sy datasz ARC data size .It Sy datasz% ARC data size percentage .It Sy metatg ARC metadata target .It Sy metatg% ARC metadata target percentage .It Sy metasz ARC metadata size .It Sy metasz% ARC metadata size percentage .It Sy anosz Anonymous size .It Sy anosz% Anonymous size percentage .It Sy anoda Anonymous data size .It Sy anoda% Anonymous data size percentage .It Sy anome Anonymous metadata size .It Sy anome% Anonymous metadata size percentage .It Sy anoed Anonymous evictable data size .It Sy anoed% Anonymous evictable data size percentage .It Sy anoem Anonymous evictable metadata size .It Sy anoem% Anonymous evictable metadata size percentage .It Sy mfutg MFU target .It Sy mfutg% MFU target percentage .It Sy mfudt MFU data target .It Sy mfudt% MFU data target percentage .It Sy mfumt MFU metadata target .It Sy mfumt% MFU metadata target percentage .It Sy mfusz MFU size .It Sy mfusz% MFU size percentage .It Sy mfuda MFU data size .It Sy mfuda% MFU data size percentage .It Sy mfume MFU metadata size .It Sy mfume% MFU metadata size percentage .It Sy mfued MFU evictable data size .It Sy mfued% MFU evictable data size percentage .It Sy mfuem MFU evictable 
metadata size .It Sy mfuem% MFU evictable metadata size percentage .It Sy mfugsz MFU ghost size .It Sy mfugd MFU ghost data size .It Sy mfugm MFU ghost metadata size .It Sy mrutg MRU target .It Sy mrutg% MRU target percentage .It Sy mrudt MRU data target .It Sy mrudt% MRU data target percentage .It Sy mrumt MRU metadata target .It Sy mrumt% MRU metadata target percentage .It Sy mrusz MRU size .It Sy mrusz% MRU size percentage .It Sy mruda MRU data size .It Sy mruda% MRU data size percentage .It Sy mrume MRU metadata size .It Sy mrume% MRU metadata size percentage .It Sy mrued MRU evictable data size .It Sy mrued% MRU evictable data size percentage .It Sy mruem MRU evictable metadata size .It Sy mruem% MRU evictable metadata size percentage .It Sy mrugsz MRU ghost size .It Sy mrugd MRU ghost data size .It Sy mrugm MRU ghost metadata size .It Sy uncsz Uncached size .It Sy uncsz% Uncached size percentage .It Sy uncda Uncached data size .It Sy uncda% Uncached data size percentage .It Sy uncme Uncached metadata size .It Sy uncme% Uncached metadata size percentage .It Sy unced Uncached evictable data size .It Sy unced% Uncached evictable data size percentage .It Sy uncem Uncached evictable metadata size .It Sy uncem% Uncached evictable metadata size percentage .It Sy grow ARC grow disabled .It Sy need ARC reclaim needed .It Sy free The ARC's idea of how much free memory there is, which includes evictable memory in the page cache. Since the ARC tries to keep .Sy avail above zero, .Sy avail is usually more instructive to observe than .Sy free . .It Sy avail The ARC's idea of how much free memory is available to it, which is a bit less than .Sy free . May temporarily be negative, in which case the ARC will reduce the target size .Sy c . .El . .Sh OPTIONS .Bl -tag -width "-v" .It Fl a Print all possible stats. .It Fl f Display only specific fields. See .Sx DESCRIPTION for supported statistics. .It Fl h Display help message. .It Fl o Report statistics to a file instead of the standard output. .It Fl p Disable auto-scaling of numerical fields (for raw, machine-parsable values). .It Fl s Display data with a specified separator (default: 2 spaces). .It Fl x Print extended stats .Pq same as Fl f Sy time , Ns Sy mfu , Ns Sy mru , Ns Sy mfug , Ns Sy mrug , Ns Sy eskip , Ns Sy mtxmis , Ns Sy dread , Ns Sy pread , Ns Sy read . .It Fl v Show field headers and definitions .El . .Sh OPERANDS The following operands are supported: .Bl -tag -compact -offset Ds -width "interval" .It Ar interval Specify the sampling interval in seconds. .It Ar count Display only .Ar count reports. .El diff --git a/sys/contrib/openzfs/man/man1/cstyle.1 b/sys/contrib/openzfs/man/man1/cstyle.1 index 241c82edd5a8..8f29129ce175 100644 --- a/sys/contrib/openzfs/man/man1/cstyle.1 +++ b/sys/contrib/openzfs/man/man1/cstyle.1 @@ -1,153 +1,153 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" Copyright 2009 Sun Microsystems, Inc. All rights reserved. .\" Use is subject to license terms. .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. 
.\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" -.Dd May 26, 2021 +.Dd April 4, 2022 .Dt CSTYLE 1 .Os . .Sh NAME .Nm cstyle .Nd check for some common stylistic errors in C source files .Sh SYNOPSIS .Nm .Op Fl chpvCP .Oo Ar file Oc Ns … .Sh DESCRIPTION .Nm inspects C source files (*.c and *.h) for common stylistic errors. It attempts to check for the cstyle documented in .Lk http://www.cis.upenn.edu/~lee/06cse480/data/cstyle.ms.pdf . Note that there is much in that document that .Em cannot be checked for; just because your code is .Nm Ns -clean does not mean that you've followed Sun's C style. .Em Caveat emptor . . .Sh OPTIONS .Bl -tag -width "-c" .It Fl c Check continuation line indentation inside of functions. Sun's C style states that all statements must be indented to an appropriate tab stop, and any continuation lines after them must be indented .Em exactly four spaces from the start line. This option enables a series of checks designed to find continuation line problems within functions only. The checks have some limitations; see .Sy CONTINUATION CHECKING , below. .It Fl p Performs some of the more picky checks. Includes ANSI .Sy #else and .Sy #endif rules, and tries to detect spaces after casts. Used as part of the putback checks. .It Fl v Verbose output; includes the text of the line of error, and, for .Fl c , the first statement in the current continuation block. .It Fl P Check for use of non-POSIX types. Historically, types like .Sy u_int and .Sy u_long were used, but they are now deprecated in favor of the POSIX types .Sy uint_t , .Sy ulong_t , etc. This detects any use of the deprecated types. Used as part of the putback checks. .It Fl g Also print GitHub-Actions-style .Li ::error output. .El . .Sh ENVIRONMENT .Bl -tag -compact -width ".Ev CI" .It Ev CI If set and nonempty, equivalent to .Fl g . .El . .Sh CONTINUATION CHECKING The continuation checker is a reasonably simple state machine that knows something about how C is laid out, and can match parenthesis, etc. over multiple lines. It does have some limitations: .Bl -enum .It Preprocessor macros which cause unmatched parenthesis will confuse the checker for that line. To fix this, you'll need to make sure that each branch of the .Sy #if statement has balanced parenthesis. .It Some .Xr cpp 1 macros do not require .Sy ;\& Ns s after them. Any such macros .Em must be ALL_CAPS; any lower case letters will cause bad output. .Pp The bad output will generally be corrected after the next .Sy ;\& , { , No or Sy } . .El Some continuation error messages deserve some additional explanation: .Bl -tag -width Ds .It Sy multiple statements continued over multiple lines A multi-line statement which is not broken at statement boundaries. For example: .Bd -literal -compact -offset Ds if (this_is_a_long_variable == another_variable) a = b + c; .Ed .Pp Will trigger this error. Instead, do: .Bd -literal -compact -offset Ds if (this_is_a_long_variable == another_variable) a = b + c; .Ed .It Sy empty if/for/while body not on its own line For visibility, empty bodies for if, for, and while statements should be on their own line. 
For example: .Bd -literal -compact -offset Ds while (do_something(&x) == 0); .Ed .Pp Will trigger this error. Instead, do: .Bd -literal -compact -offset Ds while (do_something(&x) == 0) ; .Ed .El diff --git a/sys/contrib/openzfs/man/man1/zhack.1 b/sys/contrib/openzfs/man/man1/zhack.1 index f58c0527649b..743bd53b731c 100644 --- a/sys/contrib/openzfs/man/man1/zhack.1 +++ b/sys/contrib/openzfs/man/man1/zhack.1 @@ -1,168 +1,168 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright 2013 Darik Horn . All rights reserved. .\" .\" lint-ok: WARNING: sections out of conventional order: Sh SYNOPSIS .\" -.Dd May 26, 2021 +.Dd May 3, 2023 .Dt ZHACK 1 .Os . .Sh NAME .Nm zhack .Nd libzpool debugging tool .Sh DESCRIPTION This utility pokes configuration changes directly into a ZFS pool, which is dangerous and can cause data corruption. .Sh SYNOPSIS .Bl -tag -width Ds .It Xo .Nm zhack .Cm feature stat .Ar pool .Xc List feature flags. . .It Xo .Nm zhack .Cm feature enable .Op Fl d Ar description .Op Fl r .Ar pool .Ar guid .Xc Add a new feature to .Ar pool that is uniquely identified by .Ar guid , which is specified in the same form as a .Xr zfs 8 user property. .Pp The .Ar description is a short human readable explanation of the new feature. .Pp The .Fl r flag indicates that .Ar pool can be safely opened in read-only mode by a system that does not understand the .Ar guid feature. . .It Xo .Nm zhack .Cm feature ref .Op Fl d Ns | Ns Fl m .Ar pool .Ar guid .Xc Increment the reference count of the .Ar guid feature in .Ar pool . .Pp The .Fl d flag decrements the reference count of the .Ar guid feature in .Ar pool instead. .Pp The .Fl m flag indicates that the .Ar guid feature is now required to read the pool MOS. . .It Xo .Nm zhack .Cm label repair .Op Fl cu .Ar device .Xc Repair labels of a specified .Ar device according to options. .Pp Flags may be combined to do their functions simultaneously. . .Pp The .Fl c flag repairs corrupted label checksums . .Pp The .Fl u flag restores the label on a detached device .Pp Example: .Nm zhack Cm label repair Fl cu Ar device Fix checksums and undetach a device . .El . .Sh GLOBAL OPTIONS The following can be passed to all .Nm invocations before any subcommand: .Bl -tag -width "-d dir" .It Fl c Ar cachefile Read .Ar pool configuration from the .Ar cachefile , which is .Pa /etc/zfs/zpool.cache by default. .It Fl d Ar dir Search for .Ar pool members in .Ar dir . Can be specified more than once. .El . 
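The global options combine with any of the subcommands above; a minimal illustration of pointing .Nm at pool members kept in a non-default directory (the pool name and directory are hypothetical):
.Bd -literal -offset Ds
zhack -d /dev/disk/by-id feature stat tank
.Ed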
.Sh EXAMPLES .Bd -literal .No # Nm zhack Cm feature stat Ar tank for_read_obj: org.illumos:lz4_compress = 0 for_write_obj: com.delphix:async_destroy = 0 com.delphix:empty_bpobj = 0 descriptions_obj: com.delphix:async_destroy = Destroy filesystems asynchronously. com.delphix:empty_bpobj = Snapshots use less space. org.illumos:lz4_compress = LZ4 compression algorithm support. .No # Nm zhack Cm feature enable Fl d No 'Predict future disk failures.' Ar tank com.example:clairvoyance .No # Nm zhack Cm feature ref Ar tank com.example:clairvoyance .Ed . .Sh SEE ALSO .Xr ztest 1 , .Xr zpool-features 7 , .Xr zfs 8 diff --git a/sys/contrib/openzfs/man/man1/ztest.1 b/sys/contrib/openzfs/man/man1/ztest.1 index febbb62b1664..ae857bfea29c 100644 --- a/sys/contrib/openzfs/man/man1/ztest.1 +++ b/sys/contrib/openzfs/man/man1/ztest.1 @@ -1,259 +1,259 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Oracle and/or its affiliates. All rights reserved. .\" Copyright (c) 2009 Michael Gebetsroither . All rights .\" reserved. .\" Copyright (c) 2017, Intel Corporation. .\" -.Dd May 26, 2021 +.Dd July 12, 2025 .Dt ZTEST 1 .Os . .Sh NAME .Nm ztest .Nd was written by the ZFS Developers as a ZFS unit test .Sh SYNOPSIS .Nm .Op Fl VEG .Op Fl v Ar vdevs .Op Fl s Ar size_of_each_vdev .Op Fl a Ar alignment_shift .Op Fl m Ar mirror_copies .Op Fl r Ar raidz_disks/draid_disks .Op Fl R Ar raid_parity .Op Fl K Ar raid_kind .Op Fl D Ar draid_data .Op Fl S Ar draid_spares .Op Fl C Ar vdev_class_state .Op Fl d Ar datasets .Op Fl t Ar threads .Op Fl g Ar gang_block_threshold .Op Fl i Ar initialize_pool_i_times .Op Fl k Ar kill_percentage .Op Fl p Ar pool_name .Op Fl T Ar time .Op Fl z Ar zil_failure_rate . .Nm .Fl X .Op Fl VG .Op Fl s Ar size_of_each_vdev .Op Fl a Ar alignment_shift .Op Fl r Ar raidz_disks .Op Fl R Ar raid_parity .Op Fl d Ar datasets .Op Fl t Ar threads . .Sh DESCRIPTION .Nm was written by the ZFS Developers as a ZFS unit test. The tool was developed in tandem with the ZFS functionality and was executed nightly as one of the many regression test against the daily build. As features were added to ZFS, unit tests were also added to .Nm . In addition, a separate test development team wrote and executed more functional and stress tests. . .Pp By default .Nm runs for ten minutes and uses block files (stored in .Pa /tmp ) to create pools rather than using physical disks. Block files afford .Nm its flexibility to play around with zpool components without requiring large hardware configurations. However, storing the block files in .Pa /tmp may not work for you if you have a small tmp directory. . .Pp By default is non-verbose. 
This is why entering the command above will result in .Nm quietly executing for 5 minutes. The .Fl V option can be used to increase the verbosity of the tool. Adding multiple .Fl V options is allowed and the more you add the more chatty .Nm becomes. . .Pp After the .Nm run completes, you should notice many .Pa ztest.* files lying around. Once the run completes you can safely remove these files. Note that you shouldn't remove these files during a run. You can re-use these files in your next .Nm run by using the .Fl E option. . .Sh OPTIONS .Bl -tag -width "-v v" .It Fl h , \&? , -help Print a help summary. .It Fl v , -vdevs Ns = (default: Sy 5 ) Number of vdevs. .It Fl s , -vdev-size Ns = (default: Sy 64M ) Size of each vdev. .It Fl a , -alignment-shift Ns = (default: Sy 9 ) No (use Sy 0 No for random ) Alignment shift used in test. .It Fl m , -mirror-copies Ns = (default: Sy 2 ) Number of mirror copies. .It Fl r , -raid-disks Ns = (default: Sy 4 No for raidz/ Ns Sy 16 No for draid ) Number of raidz/draid disks. .It Fl R , -raid-parity Ns = (default: Sy 1 ) Raid parity (raidz & draid). .It Xo .Fl K , -raid-kind Ns = Ns .Sy raidz Ns | Ns Sy eraidz Ns | Ns Sy draid Ns | Ns Sy random (default: .Sy random Ns ) .Xc The kind of RAID config to use. With .Sy random the kind alternates between raidz, eraidz (expandable raidz) and draid. .It Fl D , -draid-data Ns = (default: Sy 4 ) Number of data disks in a dRAID redundancy group. .It Fl S , -draid-spares Ns = (default: Sy 1 ) Number of dRAID distributed spare disks. .It Fl d , -datasets Ns = (default: Sy 7 ) Number of datasets. .It Fl t , -threads Ns = (default: Sy 23 ) Number of threads. .It Fl g , -gang-block-threshold Ns = (default: Sy 32K ) Gang block threshold. .It Fl i , -init-count Ns = (default: Sy 1 ) Number of pool initializations. .It Fl k , -kill-percentage Ns = (default: Sy 70% ) Kill percentage. .It Fl p , -pool-name Ns = (default: Sy ztest ) Pool name. .It Fl f , -vdev-file-directory Ns = (default: Pa /tmp ) File directory for vdev files. .It Fl M , -multi-host Multi-host; simulate pool imported on remote host. .It Fl E , -use-existing-pool Use existing pool (use existing pool instead of creating new one). .It Fl T , -run-time Ns = (default: Sy 300 Ns s) Total test run time. .It Fl P , -pass-time Ns = (default: Sy 60 Ns s) Time per pass. .It Fl F , -freeze-loops Ns = (default: Sy 50 ) Max loops in .Fn spa_freeze . .It Fl B , -alt-ztest Ns = Path to alternate ("older") .Nm ztest to drive, which will be used to initialize the pool, and, a stochastic half the time, to run the tests. The parallel .Pa lib directory is prepended to .Ev LD_LIBRARY_PATH ; i.e. given .Fl B Pa ./chroots/lenny/usr/bin/ Ns Nm , .Pa ./chroots/lenny/usr/lib will be loaded. .It Fl C , -vdev-class-state Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy random No (default : Sy random ) The vdev allocation class state. .It Fl o , -option Ns = Ns Ar var Ns = Ns Ar value Ns … Set the given tunable to the provided value. .It Fl G , -dump-debug Dump zfs_dbgmsg buffer before exiting due to an error. .It Fl V , -verbose Verbose (use multiple times for ever more verbosity). .It Fl X , -raidz-expansion Perform a dedicated raidz expansion test. .El . .Sh EXAMPLES To override .Pa /tmp as your location for block files, you can use the .Fl f option: .Dl # ztest -f / .Pp To get an idea of what .Nm is actually testing try this: .Dl # ztest -f / -VVV .Pp Maybe you'd like to run .Nm ztest for longer? 
To do so simply use the .Fl T option and specify the runlength in seconds like so: .Dl # ztest -f / -V -T 120 . .Sh ENVIRONMENT VARIABLES .Bl -tag -width "ZF" .It Ev ZFS_HOSTID Ns = Ns Em id Use .Em id instead of the SPL hostid to identify this host. Intended for use with .Nm , but this environment variable will affect any utility which uses libzpool, including .Xr zpool 8 . Since the kernel is unaware of this setting, results with utilities other than ztest are undefined. .It Ev ZFS_STACK_SIZE Ns = Ns Em stacksize Limit the default stack size to .Em stacksize bytes for the purpose of detecting and debugging kernel stack overflows. This value defaults to .Em 32K which is double the default .Em 16K Linux kernel stack size. .Pp In practice, setting the stack size slightly higher is needed because differences in stack usage between kernel and user space can lead to spurious stack overflows (especially when debugging is enabled). The specified value will be rounded up to a floor of PTHREAD_STACK_MIN which is the minimum stack required for a NULL procedure in user space. .Pp By default the stack size is limited to .Em 256K . .El . .Sh SEE ALSO .Xr zdb 1 , .Xr zfs 1 , .Xr zpool 1 , .Xr spl 4 diff --git a/sys/contrib/openzfs/man/man4/spl.4 b/sys/contrib/openzfs/man/man4/spl.4 index 683f8e2b631f..61dfe42e463d 100644 --- a/sys/contrib/openzfs/man/man4/spl.4 +++ b/sys/contrib/openzfs/man/man4/spl.4 @@ -1,183 +1,183 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" .\" Copyright 2013 Turbo Fredriksson . All rights reserved. .\" -.Dd August 24, 2020 +.Dd May 7, 2025 .Dt SPL 4 .Os . .Sh NAME .Nm spl .Nd parameters of the SPL kernel module . .Sh DESCRIPTION .Bl -tag -width Ds .It Sy spl_kmem_cache_kmem_threads Ns = Ns Sy 4 Pq uint The number of threads created for the spl_kmem_cache task queue. This task queue is responsible for allocating new slabs for use by the kmem caches. For the majority of systems and workloads only a small number of threads are required. . .It Sy spl_kmem_cache_obj_per_slab Ns = Ns Sy 8 Pq uint The preferred number of objects per slab in the cache. In general, a larger value will increase the caches memory footprint while decreasing the time required to perform an allocation. Conversely, a smaller value will minimize the footprint and improve cache reclaim time but individual allocations may take longer. . .It Sy spl_kmem_cache_max_size Ns = Ns Sy 32 Po 64-bit Pc or Sy 4 Po 32-bit Pc Pq uint The maximum size of a kmem cache slab in MiB. This effectively limits the maximum cache object size to .Sy spl_kmem_cache_max_size Ns / Ns Sy spl_kmem_cache_obj_per_slab . .Pp Caches may not be created with object sized larger than this limit. . 
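As a worked instance of the limit above, the documented defaults give a maximum cache object size of 32 MiB / 8 objects per slab = 4 MiB. A brief sketch for checking the live values (the sysfs path assumes Linux with the SPL module loaded):
.Bd -literal -offset Ds
cat /sys/module/spl/parameters/spl_kmem_cache_max_size
cat /sys/module/spl/parameters/spl_kmem_cache_obj_per_slab
# maximum object size in MiB with the defaults quoted above
echo $((32 / 8))
.Ed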
.It Sy spl_kmem_cache_slab_limit Ns = Ns Sy 16384 Pq uint For small objects the Linux slab allocator should be used to make the most efficient use of the memory. However, large objects are not supported by the Linux slab and therefore the SPL implementation is preferred. This value is used to determine the cutoff between a small and large object. .Pp Objects of size .Sy spl_kmem_cache_slab_limit or smaller will be allocated using the Linux slab allocator, while large objects use the SPL allocator. A cutoff of 16K was determined to be optimal for architectures using 4K pages. . .It Sy spl_kmem_alloc_warn Ns = Ns Sy 32768 Pq uint As a general rule .Fn kmem_alloc allocations should be small, preferably just a few pages, since they must be physically contiguous. Therefore, a rate limited warning will be printed to the console for any .Fn kmem_alloc which exceeds a reasonable threshold. .Pp The default warning threshold is set to eight pages but capped at 32K to accommodate systems using large pages. This value was selected to be small enough to ensure the largest allocations are quickly noticed and fixed, but large enough to avoid logging any warnings when an allocation size is larger than optimal but not a serious concern. Since this value is tunable, developers are encouraged to set it lower when testing so any new largish allocations are quickly caught. These warnings may be disabled by setting the threshold to zero. . .It Sy spl_kmem_alloc_max Ns = Ns Sy KMALLOC_MAX_SIZE Ns / Ns Sy 4 Pq uint Large .Fn kmem_alloc allocations will fail if they exceed .Sy KMALLOC_MAX_SIZE . Allocations which are marginally smaller than this limit may succeed but should still be avoided due to the expense of locating a contiguous range of free pages. Therefore, a maximum kmem size with a reasonable safety margin of 4x is set. .Fn kmem_alloc allocations larger than this maximum will quickly fail. .Fn vmem_alloc allocations less than or equal to this value will use .Fn kmalloc , but shift to .Fn vmalloc when exceeding this value. . .It Sy spl_kmem_cache_magazine_size Ns = Ns Sy 0 Pq uint Cache magazines are an optimization designed to minimize the cost of allocating memory. They do this by keeping a per-CPU cache of recently freed objects, which can then be reallocated without taking a lock. This can improve performance on highly contended caches. However, because objects in magazines will prevent otherwise empty slabs from being immediately released, this may not be ideal for low memory machines. .Pp For this reason, .Sy spl_kmem_cache_magazine_size can be used to set a maximum magazine size. When this value is set to 0 the magazine size will be automatically determined based on the object size. Otherwise magazines will be limited to 2-256 objects per magazine (i.e. per CPU). Magazines may never be entirely disabled in this implementation. . .It Sy spl_hostid Ns = Ns Sy 0 Pq ulong The system hostid. When set, this can be used to uniquely identify a system. By default this value is set to zero, which indicates the hostid is disabled. It can be explicitly enabled by placing a unique non-zero value in .Pa /etc/hostid . . .It Sy spl_hostid_path Ns = Ns Pa /etc/hostid Pq charp The expected path to locate the system hostid when specified. This value may be overridden for non-standard configurations. . .It Sy spl_panic_halt Ns = Ns Sy 0 Pq uint Cause a kernel panic on assertion failures. When not enabled, the thread is halted to facilitate further debugging. .Pp Set to a non-zero value to enable. .
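For the .Sy spl_hostid and .Sy spl_hostid_path parameters above, one way to record a persistent non-zero hostid is the OpenZFS .Xr zgenhostid 8 helper, which writes .Pa /etc/hostid ; a brief sketch (the chosen value and whether a fixed hostid is appropriate depend on your setup):
.Bd -literal -offset Ds
zgenhostid $(hostid)   # record the current libc hostid in /etc/hostid
zgenhostid             # or generate and store a random one
.Ed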
.It Sy spl_taskq_kick Ns = Ns Sy 0 Pq uint Kick stuck taskq to spawn threads. When writing a non-zero value to it, it will scan all the taskqs. If any of them have a pending task more than 5 seconds old, it will kick it to spawn more threads. This can be used if you find a rare deadlock occurs because one or more taskqs didn't spawn a thread when they should have. . .It Sy spl_taskq_thread_bind Ns = Ns Sy 0 Pq int Bind taskq threads to specific CPUs. When enabled, all taskq threads will be distributed evenly across the available CPUs. By default, this behavior is disabled to allow the Linux scheduler the maximum flexibility to determine where a thread should run. . .It Sy spl_taskq_thread_dynamic Ns = Ns Sy 1 Pq int Allow dynamic taskqs. When enabled, taskqs which set the .Sy TASKQ_DYNAMIC flag will by default create only a single thread. New threads will be created on demand up to a maximum allowed number to facilitate the completion of outstanding tasks. Threads which are no longer needed will be promptly destroyed. By default this behavior is enabled, but it can be disabled to aid performance analysis or troubleshooting. . .It Sy spl_taskq_thread_priority Ns = Ns Sy 1 Pq int Allow newly created taskq threads to set a non-default scheduler priority. When enabled, the priority specified when a taskq is created will be applied to all threads created by that taskq. When disabled, all threads will use the default Linux kernel thread priority. By default, this behavior is enabled. . .It Sy spl_taskq_thread_sequential Ns = Ns Sy 4 Pq int The number of items a taskq worker thread must handle without interruption before requesting a new worker thread be spawned. This is used to control how quickly taskqs ramp up the number of threads processing the queue. Because Linux thread creation and destruction are relatively inexpensive, a small default value has been selected. This means that normally threads will be created aggressively, which is desirable. Increasing this value will result in a slower thread creation rate, which may be preferable for some configurations. . .It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 5000 Pq uint Minimum idle-thread exit interval for dynamic taskqs. Smaller values allow idle threads to exit more often and potentially be respawned again on demand, causing more churn. .El diff --git a/sys/contrib/openzfs/man/man4/zfs.4 index 5c7958667f92..e865d6a79c5a 100644 --- a/sys/contrib/openzfs/man/man4/zfs.4 +++ b/sys/contrib/openzfs/man/man4/zfs.4 @@ -1,2918 +1,2918 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. .\" Copyright (c) 2019, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2019 Datto Inc. .\" Copyright (c) 2023, 2024, 2025, Klara, Inc. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE.
If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" -.Dd May 29, 2025 +.Dd August 14, 2025 .Dt ZFS 4 .Os . .Sh NAME .Nm zfs .Nd tuning of the ZFS kernel module . .Sh DESCRIPTION The ZFS module supports these parameters: .Bl -tag -width Ds .It Sy dbuf_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64 Maximum size in bytes of the dbuf cache. The target size is determined by the MIN versus .No 1/2^ Ns Sy dbuf_cache_shift Pq 1/32nd of the target ARC size. The behavior of the dbuf cache and its associated settings can be observed via the .Pa /proc/spl/kstat/zfs/dbufstats kstat. . .It Sy dbuf_metadata_cache_max_bytes Ns = Ns Sy UINT64_MAX Ns B Pq u64 Maximum size in bytes of the metadata dbuf cache. The target size is determined by the MIN versus .No 1/2^ Ns Sy dbuf_metadata_cache_shift Pq 1/64th of the target ARC size. The behavior of the metadata dbuf cache and its associated settings can be observed via the .Pa /proc/spl/kstat/zfs/dbufstats kstat. . .It Sy dbuf_cache_hiwater_pct Ns = Ns Sy 10 Ns % Pq uint The percentage over .Sy dbuf_cache_max_bytes when dbufs must be evicted directly. . .It Sy dbuf_cache_lowater_pct Ns = Ns Sy 10 Ns % Pq uint The percentage below .Sy dbuf_cache_max_bytes when the evict thread stops evicting dbufs. . .It Sy dbuf_cache_shift Ns = Ns Sy 5 Pq uint Set the size of the dbuf cache .Pq Sy dbuf_cache_max_bytes to a log2 fraction of the target ARC size. . .It Sy dbuf_metadata_cache_shift Ns = Ns Sy 6 Pq uint Set the size of the dbuf metadata cache .Pq Sy dbuf_metadata_cache_max_bytes to a log2 fraction of the target ARC size. . .It Sy dbuf_mutex_cache_shift Ns = Ns Sy 0 Pq uint Set the size of the mutex array for the dbuf cache. When set to .Sy 0 the array is dynamically sized based on total system memory. . .It Sy dmu_object_alloc_chunk_shift Ns = Ns Sy 7 Po 128 Pc Pq uint dnode slots allocated in a single operation as a power of 2. The default value minimizes lock contention for the bulk operation performed. . .It Sy dmu_ddt_copies Ns = Ns Sy 3 Pq uint Controls the number of copies stored for DeDup Table .Pq DDT objects. Reducing the number of copies to 1 from the previous default of 3 can reduce the write inflation caused by deduplication. This assumes redundancy for this data is provided by the vdev layer. If the DDT is damaged, space may be leaked .Pq not freed when the DDT can not report the correct reference count. . .It Sy dmu_prefetch_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Limit the amount we can prefetch with one call to this amount in bytes. This helps to limit the amount of memory that can be used by prefetching. . .It Sy l2arc_feed_again Ns = Ns Sy 1 Ns | Ns 0 Pq int Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as fast as possible. . .It Sy l2arc_feed_min_ms Ns = Ns Sy 200 Pq u64 Min feed interval in milliseconds. Requires .Sy l2arc_feed_again Ns = Ns Ar 1 and only applicable in related situations. . .It Sy l2arc_feed_secs Ns = Ns Sy 1 Pq u64 Seconds between L2ARC writing. . .It Sy l2arc_headroom Ns = Ns Sy 8 Pq u64 How far through the ARC lists to search for L2ARC cacheable content, expressed as a multiplier of .Sy l2arc_write_max . ARC persistence across reboots can be achieved with persistent L2ARC by setting this parameter to .Sy 0 , allowing the full length of ARC lists to be searched for cacheable content. . 
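On Linux these parameters can usually be inspected and changed at runtime through sysfs, or set persistently through modprobe configuration; a hedged sketch using .Sy l2arc_headroom Ns = Ns Sy 0 (the persistent-L2ARC setting described above) as the example:
.Bd -literal -offset Ds
# runtime change
echo 0 > /sys/module/zfs/parameters/l2arc_headroom
# persistent across module loads (the file name is a common convention)
echo "options zfs l2arc_headroom=0" >> /etc/modprobe.d/zfs.conf
.Ed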
.It Sy l2arc_headroom_boost Ns = Ns Sy 200 Ns % Pq u64 Scales .Sy l2arc_headroom by this percentage when L2ARC contents are being successfully compressed before writing. A value of .Sy 100 disables this feature. . .It Sy l2arc_exclude_special Ns = Ns Sy 0 Ns | Ns 1 Pq int Controls whether buffers present on special vdevs are eligible for caching into L2ARC. If set to 1, exclude dbufs on special vdevs from being cached to L2ARC. . .It Sy l2arc_mfuonly Ns = Ns Sy 0 Ns | Ns 1 Ns | Ns 2 Pq int Controls whether only MFU metadata and data are cached from ARC into L2ARC. This may be desired to avoid wasting space on L2ARC when reading/writing large amounts of data that are not expected to be accessed more than once. .Pp The default is 0, meaning both MRU and MFU data and metadata are cached. When turning off this feature (setting it to 0), some MRU buffers will still be present in ARC and eventually cached on L2ARC. .No If Sy l2arc_noprefetch Ns = Ns Sy 0 , some prefetched buffers will be cached to L2ARC, and those might later transition to MRU, in which case the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp Setting it to 1 means to L2 cache only MFU data and metadata. .Pp Setting it to 2 means to L2 cache all metadata (MRU+MFU) but only MFU data (i.e. MRU data are not cached). This can be the right setting to cache as much metadata as possible even when having high data turnover. .Pp Regardless of .Sy l2arc_noprefetch , some MFU buffers might be evicted from ARC, accessed later on as prefetches and transition to MRU as prefetches. If accessed again they are counted as MRU and the .Sy l2arc_mru_asize No arcstat will not be Sy 0 . .Pp The ARC status of L2ARC buffers when they were first cached in L2ARC can be seen in the .Sy l2arc_mru_asize , Sy l2arc_mfu_asize , No and Sy l2arc_prefetch_asize arcstats when importing the pool or onlining a cache device if persistent L2ARC is enabled. .Pp The .Sy evict_l2_eligible_mru arcstat does not take into account if this option is enabled as the information provided by the .Sy evict_l2_eligible_m[rf]u arcstats can be used to decide if toggling this option is appropriate for the current workload. . .It Sy l2arc_meta_percent Ns = Ns Sy 33 Ns % Pq uint Percent of ARC size allowed for L2ARC-only headers. Since L2ARC buffers are not evicted on memory pressure, too many headers on a system with an irrationally large L2ARC can render it slow or unusable. This parameter limits L2ARC writes and rebuilds to achieve the target. . .It Sy l2arc_trim_ahead Ns = Ns Sy 0 Ns % Pq u64 Trims ahead of the current write size .Pq Sy l2arc_write_max on L2ARC devices by this percentage of write size if we have filled the device. If set to .Sy 100 we TRIM twice the space required to accommodate upcoming writes. A minimum of .Sy 64 MiB will be trimmed. It also enables TRIM of the whole L2ARC device upon creation or addition to an existing pool or if the header of the device is invalid upon importing a pool or onlining a cache device. A value of .Sy 0 disables TRIM on L2ARC altogether and is the default as it can put significant stress on the underlying storage devices. This will vary depending of how well the specific device handles these commands. . .It Sy l2arc_noprefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int Do not write buffers to L2ARC if they were prefetched but not used by applications. In case there are prefetched buffers in L2ARC and this option is later set, we do not read the prefetched buffers from L2ARC. 
Unsetting this option is useful for caching sequential reads from the disks to L2ARC and serve those reads from L2ARC later on. This may be beneficial in case the L2ARC device is significantly faster in sequential reads than the disks of the pool. .Pp Use .Sy 1 to disable and .Sy 0 to enable caching/reading prefetches to/from L2ARC. . .It Sy l2arc_norw Ns = Ns Sy 0 Ns | Ns 1 Pq int No reads during writes. . .It Sy l2arc_write_boost Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 Cold L2ARC devices will have .Sy l2arc_write_max increased by this amount while they remain cold. . .It Sy l2arc_write_max Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 Max write bytes per interval. . .It Sy l2arc_rebuild_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Rebuild the L2ARC when importing a pool (persistent L2ARC). This can be disabled if there are problems importing a pool or attaching an L2ARC device (e.g. the L2ARC device is slow in reading stored log metadata, or the metadata has become somehow fragmented/unusable). . .It Sy l2arc_rebuild_blocks_min_l2size Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Minimum size of an L2ARC device required in order to write log blocks in it. The log blocks are used upon importing the pool to rebuild the persistent L2ARC. .Pp For L2ARC devices less than 1 GiB, the amount of data .Fn l2arc_evict evicts is significant compared to the amount of restored L2ARC data. In this case, do not write log blocks in L2ARC in order not to waste space. . .It Sy metaslab_aliquot Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq u64 Metaslab group's per child vdev allocation granularity, in bytes. This is roughly similar to what would be referred to as the "stripe size" in traditional RAID arrays. In normal operation, ZFS will try to write this amount of data to each child of a top-level vdev before moving on to the next top-level vdev. . .It Sy metaslab_bias_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab groups biasing based on their over- or under-utilization relative to the metaslab class average. If disabled, each metaslab group will receive allocations proportional to its capacity. . .It Sy metaslab_perf_bias Ns = Ns Sy 1 Ns | Ns 0 Ns | Ns 2 Pq int Controls metaslab groups biasing based on their write performance. Setting to 0 makes all metaslab groups receive fixed amounts of allocations. Setting to 2 allows faster metaslab groups to allocate more. Setting to 1 equals to 2 if the pool is write-bound or 0 otherwise. That is, if the pool is limited by write throughput, then allocate more from faster metaslab groups, but if not, try to evenly distribute the allocations. . .It Sy metaslab_force_ganging Ns = Ns Sy 16777217 Ns B Po 16 MiB + 1 B Pc Pq u64 Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . .It Sy metaslab_force_ganging_pct Ns = Ns Sy 3 Ns % Pq uint For blocks that could be forced to be a gang block (due to .Sy metaslab_force_ganging ) , force this many of them to be gang blocks. . .It Sy brt_zap_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int Controls prefetching BRT records for blocks which are going to be cloned. . .It Sy brt_zap_default_bs Ns = Ns Sy 12 Po 4 KiB Pc Pq int Default BRT ZAP data block size as a power of 2. Note that changing this after creating a BRT on the pool will not affect existing BRTs, only newly created ones. . .It Sy brt_zap_default_ibs Ns = Ns Sy 12 Po 4 KiB Pc Pq int Default BRT ZAP indirect block size as a power of 2. 
Note that changing this after creating a BRT on the pool will not affect existing BRTs, only newly created ones. . .It Sy ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP data block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . .It Sy ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int Default DDT ZAP indirect block size as a power of 2. Note that changing this after creating a DDT on the pool will not affect existing DDTs, only newly created ones. . .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int Default dnode block size as a power of 2. . .It Sy zfs_default_ibs Ns = Ns Sy 17 Po 128 KiB Pc Pq int Default dnode indirect block size as a power of 2. . .It Sy zfs_dio_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable Direct I/O. If this setting is 0, then all I/O requests will be directed through the ARC acting as though the dataset property .Sy direct was set to .Sy disabled . . .It Sy zfs_dio_strict Ns = Ns Sy 0 Ns | Ns 1 Pq int Strictly enforce alignment for Direct I/O requests, returning .Sy EINVAL if not page-aligned instead of silently falling back to uncached I/O. . .It Sy zfs_history_output_max Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 When attempting to log an output nvlist of an ioctl in the on-disk history, the output will not be stored if it is larger than this size (in bytes). This must be less than .Sy DMU_MAX_ACCESS Pq 64 MiB . This applies primarily to .Fn zfs_ioc_channel_program Pq cf. Xr zfs-program 8 . . .It Sy zfs_keep_log_spacemaps_at_export Ns = Ns Sy 0 Ns | Ns 1 Pq int Prevent log spacemaps from being destroyed during pool exports and destroys. . .It Sy zfs_metaslab_segment_weight_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable segment-based metaslab selection. . .It Sy zfs_metaslab_switch_threshold Ns = Ns Sy 2 Pq int When using segment-based metaslab selection, continue allocating from the active metaslab until this option's worth of buckets have been exhausted. . .It Sy metaslab_debug_load Ns = Ns Sy 0 Ns | Ns 1 Pq int Load all metaslabs during pool import. . .It Sy metaslab_debug_unload Ns = Ns Sy 0 Ns | Ns 1 Pq int Prevent metaslabs from being unloaded. . .It Sy metaslab_fragmentation_factor_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable use of the fragmentation metric in computing metaslab weights. . .It Sy metaslab_df_max_search Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint Maximum distance to search forward from the last offset. Without this limit, fragmented pools can see .Em >100`000 iterations and .Fn metaslab_block_picker becomes the performance limiting factor on high-performance storage. .Pp With the default setting of .Sy 16 MiB , we typically see less than .Em 500 iterations, even with very fragmented .Sy ashift Ns = Ns Sy 9 pools. The maximum number of iterations possible is .Sy metaslab_df_max_search / 2^(ashift+1) . With the default setting of .Sy 16 MiB this is .Em 16*1024 Pq with Sy ashift Ns = Ns Sy 9 or .Em 2*1024 Pq with Sy ashift Ns = Ns Sy 12 . . .It Sy metaslab_df_use_largest_segment Ns = Ns Sy 0 Ns | Ns 1 Pq int If not searching forward (due to .Sy metaslab_df_max_search , metaslab_df_free_pct , .No or Sy metaslab_df_alloc_threshold ) , this tunable controls which segment is used. If set, we will use the largest free segment. If unset, we will use a segment of at least the requested size. . 
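The iteration bound given above for .Sy metaslab_df_max_search follows directly from the formula; a quick arithmetic check with the 16 MiB default:
.Bd -literal -offset Ds
echo $((16777216 / (1 << (9 + 1))))    # ashift=9  -> 16384 (16*1024)
echo $((16777216 / (1 << (12 + 1))))   # ashift=12 -> 2048  (2*1024)
.Ed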
.It Sy zfs_metaslab_max_size_cache_sec Ns = Ns Sy 3600 Ns s Po 1 hour Pc Pq u64 When we unload a metaslab, we cache the size of the largest free chunk. We use that cached size to determine whether or not to load a metaslab for a given allocation. As more frees accumulate in that metaslab while it's unloaded, the cached max size becomes less and less accurate. After a number of seconds controlled by this tunable, we stop considering the cached max size and start considering only the histogram instead. . .It Sy zfs_metaslab_mem_limit Ns = Ns Sy 25 Ns % Pq uint When we are loading a new metaslab, we check the amount of memory being used to store metaslab range trees. If it is over a threshold, we attempt to unload the least recently used metaslab to prevent the system from clogging all of its memory with range trees. This tunable sets the percentage of total system memory that is the threshold. . .It Sy zfs_metaslab_try_hard_before_gang Ns = Ns Sy 0 Ns | Ns 1 Pq int .Bl -item -compact .It If unset, we will first try normal allocation. .It If that fails then we will do a gang allocation. .It If that fails then we will do a "try hard" gang allocation. .It If that fails then we will have a multi-layer gang block. .El .Pp .Bl -item -compact .It If set, we will first try normal allocation. .It If that fails then we will do a "try hard" allocation. .It If that fails we will do a gang allocation. .It If that fails we will do a "try hard" gang allocation. .It If that fails then we will have a multi-layer gang block. .El . .It Sy zfs_metaslab_find_max_tries Ns = Ns Sy 100 Pq uint When not trying hard, we only consider this number of the best metaslabs. This improves performance, especially when there are many metaslabs per vdev and the allocation can't actually be satisfied (so we would otherwise iterate all metaslabs). . .It Sy zfs_vdev_default_ms_count Ns = Ns Sy 200 Pq uint When a vdev is added, target this number of metaslabs per top-level vdev. . .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq uint Default lower limit for metaslab size. . .It Sy zfs_vdev_max_ms_shift Ns = Ns Sy 34 Po 16 GiB Pc Pq uint Default upper limit for metaslab size. . .It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq uint Maximum ashift used when optimizing for logical \[->] physical sector size on new top-level vdevs. May be increased up to .Sy ASHIFT_MAX Po 16 Pc , but this may negatively impact pool space efficiency. . .It Sy zfs_vdev_direct_write_verify Ns = Ns Sy Linux 1 | FreeBSD 0 Pq uint If non-zero, then a Direct I/O write's checksum will be verified every time the write is issued and before it is committed to the block pointer. In the event the checksum is not valid then the I/O operation will return EIO. This module parameter can be used to detect if the contents of the users buffer have changed in the process of doing a Direct I/O write. It can also help to identify if reported checksum errors are tied to Direct I/O writes. Each verify error causes a .Sy dio_verify_wr zevent. Direct Write I/O checksum verify errors can be seen with .Nm zpool Cm status Fl d . The default value for this is 1 on Linux, but is 0 for .Fx because user pages can be placed under write protection in .Fx before the Direct I/O write is issued. . .It Sy zfs_vdev_min_auto_ashift Ns = Ns Sy ASHIFT_MIN Po 9 Pc Pq uint Minimum ashift used when creating new top-level vdevs. . .It Sy zfs_vdev_min_ms_count Ns = Ns Sy 16 Pq uint Minimum number of metaslabs to create in a top-level vdev. . 
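As noted under .Sy zfs_vdev_direct_write_verify above, verify failures are raised as .Sy dio_verify_wr events and counted in pool status; a brief illustration (the pool name is hypothetical, and the sysfs path assumes Linux):
.Bd -literal -offset Ds
cat /sys/module/zfs/parameters/zfs_vdev_direct_write_verify
zpool status -d tank
.Ed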
.It Sy vdev_validate_skip Ns = Ns Sy 0 Ns | Ns 1 Pq int Skip label validation steps during pool import. Changing is not recommended unless you know what you're doing and are recovering a damaged label. . .It Sy zfs_vdev_ms_count_limit Ns = Ns Sy 131072 Po 128k Pc Pq uint Practical upper limit of total metaslabs per top-level vdev. . .It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable metaslab group preloading. . .It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint Maximum number of metaslabs per group to preload . .It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint Percentage of CPUs to run a metaslab preload taskq . .It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Give more weight to metaslabs with lower LBAs, assuming they have greater bandwidth, as is typically the case on a modern constant angular velocity disk drive. . .It Sy metaslab_unload_delay Ns = Ns Sy 32 Pq uint After a metaslab is used, we keep it loaded for this many TXGs, to attempt to reduce unnecessary reloading. Note that both this many TXGs and .Sy metaslab_unload_delay_ms milliseconds must pass before unloading will occur. . .It Sy metaslab_unload_delay_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq uint After a metaslab is used, we keep it loaded for this many milliseconds, to attempt to reduce unnecessary reloading. Note, that both this many milliseconds and .Sy metaslab_unload_delay TXGs must pass before unloading will occur. . .It Sy reference_history Ns = Ns Sy 3 Pq uint Maximum reference holders being tracked when reference_tracking_enable is active. .It Sy raidz_expand_max_copy_bytes Ns = Ns Sy 160MB Pq ulong Max amount of memory to use for RAID-Z expansion I/O. This limits how much I/O can be outstanding at once. . .It Sy raidz_expand_max_reflow_bytes Ns = Ns Sy 0 Pq ulong For testing, pause RAID-Z expansion when reflow amount reaches this value. . .It Sy raidz_io_aggregate_rows Ns = Ns Sy 4 Pq ulong For expanded RAID-Z, aggregate reads that have more rows than this. . .It Sy reference_history Ns = Ns Sy 3 Pq int Maximum reference holders being tracked when reference_tracking_enable is active. . .It Sy reference_tracking_enable Ns = Ns Sy 0 Ns | Ns 1 Pq int Track reference holders to .Sy refcount_t objects (debug builds only). . .It Sy send_holes_without_birth_time Ns = Ns Sy 1 Ns | Ns 0 Pq int When set, the .Sy hole_birth optimization will not be used, and all holes will always be sent during a .Nm zfs Cm send . This is useful if you suspect your datasets are affected by a bug in .Sy hole_birth . . .It Sy spa_config_path Ns = Ns Pa /etc/zfs/zpool.cache Pq charp SPA config file. . .It Sy spa_asize_inflation Ns = Ns Sy 24 Pq uint Multiplication factor used to estimate actual disk consumption from the size of data being written. The default value is a worst case estimate, but lower values may be valid for a given pool depending on its configuration. Pool administrators who understand the factors involved may wish to specify a more realistic inflation factor, particularly if they operate close to quota or capacity limits. . .It Sy spa_load_print_vdev_tree Ns = Ns Sy 0 Ns | Ns 1 Pq int Whether to print the vdev tree in the debugging message buffer during pool import. . .It Sy spa_load_verify_data Ns = Ns Sy 1 Ns | Ns 0 Pq int Whether to traverse data blocks during an "extreme rewind" .Pq Fl X import. .Pp An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is unset, the traversal skips non-metadata blocks. 
It can be toggled once the import has started to stop or start the traversal of non-metadata blocks. . .It Sy spa_load_verify_metadata Ns = Ns Sy 1 Ns | Ns 0 Pq int Whether to traverse blocks during an "extreme rewind" .Pq Fl X pool import. .Pp An extreme rewind import normally performs a full traversal of all blocks in the pool for verification. If this parameter is unset, the traversal is not performed. It can be toggled once the import has started to stop or start the traversal. . .It Sy spa_load_verify_shift Ns = Ns Sy 4 Po 1/16th Pc Pq uint Sets the maximum number of bytes to consume during pool import to the log2 fraction of the target ARC size. . .It Sy spa_slop_shift Ns = Ns Sy 5 Po 1/32nd Pc Pq int Normally, we don't allow the last .Sy 3.2% Pq Sy 1/2^spa_slop_shift of space in the pool to be consumed. This ensures that we don't run the pool completely out of space, due to unaccounted changes (e.g. to the MOS). It also limits the worst-case time to allocate space. If we have less than this amount of free space, most ZPL operations (e.g. write, create) will return .Sy ENOSPC . . .It Sy spa_num_allocators Ns = Ns Sy 4 Pq int Determines the number of block allocators to use per spa instance. Capped by the number of actual CPUs in the system via .Sy spa_cpus_per_allocator . .Pp Note that setting this value too high could result in performance degradation and/or excess fragmentation. Set value only applies to pools imported/created after that. . .It Sy spa_cpus_per_allocator Ns = Ns Sy 4 Pq int Determines the minimum number of CPUs in a system for block allocator per spa instance. Set value only applies to pools imported/created after that. . .It Sy spa_upgrade_errlog_limit Ns = Ns Sy 0 Pq uint Limits the number of on-disk error log entries that will be converted to the new format when enabling the .Sy head_errlog feature. The default is to convert all log entries. . .It Sy vdev_removal_max_span Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint During top-level vdev removal, chunks of data are copied from the vdev which may include free space in order to trade bandwidth for IOPS. This parameter determines the maximum span of free space, in bytes, which will be included as "unnecessary" data in a chunk of copied data. .Pp The default value here was chosen to align with .Sy zfs_vdev_read_gap_limit , which is a similar concept when doing regular reads (but there's no reason it has to be the same). . .It Sy vdev_file_logical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Logical ashift for file-based devices. . .It Sy vdev_file_physical_ashift Ns = Ns Sy 9 Po 512 B Pc Pq u64 Physical ashift for file-based devices. . .It Sy zap_iterate_prefetch Ns = Ns Sy 1 Ns | Ns 0 Pq int If set, when we start iterating over a ZAP object, prefetch the entire object (all leaf blocks). However, this is limited by .Sy dmu_prefetch_max . . .It Sy zap_micro_max_size Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq int Maximum micro ZAP size. A "micro" ZAP is upgraded to a "fat" ZAP once it grows beyond the specified size. Sizes higher than 128KiB will be clamped to 128KiB unless the .Sy large_microzap feature is enabled. . .It Sy zap_shrink_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int If set, adjacent empty ZAP blocks will be collapsed, reducing disk space. . .It Sy zfetch_min_distance Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Min bytes to prefetch per stream. Prefetch distance starts from the demand access size and quickly grows to this value, doubling on each hit. 
After that it may grow further by 1/8 per hit, but only if some prefetch since last time haven't completed in time to satisfy demand request, i.e. prefetch depth didn't cover the read latency or the pool got saturated. . .It Sy zfetch_max_distance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch per stream. . .It Sy zfetch_max_idistance Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq uint Max bytes to prefetch indirects for per stream. . .It Sy zfetch_max_reorder Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint Requests within this byte distance from the current prefetch stream position are considered parts of the stream, reordered due to parallel processing. Such requests do not advance the stream position immediately unless .Sy zfetch_hole_shift fill threshold is reached, but saved to fill holes in the stream later. . .It Sy zfetch_max_streams Ns = Ns Sy 8 Pq uint Max number of streams per zfetch (prefetch streams per file). . .It Sy zfetch_min_sec_reap Ns = Ns Sy 1 Pq uint Min time before inactive prefetch stream can be reclaimed . .It Sy zfetch_max_sec_reap Ns = Ns Sy 2 Pq uint Max time before inactive prefetch stream can be deleted . .It Sy zfs_abd_scatter_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enables ARC from using scatter/gather lists and forces all allocations to be linear in kernel memory. Disabling can improve performance in some code paths at the expense of fragmented kernel memory. . .It Sy zfs_abd_scatter_max_order Ns = Ns Sy MAX_ORDER\-1 Pq uint Maximum number of consecutive memory pages allocated in a single block for scatter/gather lists. .Pp The value of .Sy MAX_ORDER depends on kernel configuration. . .It Sy zfs_abd_scatter_min_size Ns = Ns Sy 1536 Ns B Po 1.5 KiB Pc Pq uint This is the minimum allocation size that will use scatter (page-based) ABDs. Smaller allocations will use linear ABDs. . .It Sy zfs_arc_dnode_limit Ns = Ns Sy 0 Ns B Pq u64 When the number of bytes consumed by dnodes in the ARC exceeds this number of bytes, try to unpin some of it in response to demand for non-metadata. This value acts as a ceiling to the amount of dnode metadata, and defaults to .Sy 0 , which indicates that a percent which is based on .Sy zfs_arc_dnode_limit_percent of the ARC meta buffers that may be used for dnodes. .It Sy zfs_arc_dnode_limit_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage that can be consumed by dnodes of ARC meta buffers. .Pp See also .Sy zfs_arc_dnode_limit , which serves a similar purpose but has a higher priority if nonzero. . .It Sy zfs_arc_dnode_reduce_percent Ns = Ns Sy 10 Ns % Pq u64 Percentage of ARC dnodes to try to scan in response to demand for non-metadata when the number of bytes consumed by dnodes exceeds .Sy zfs_arc_dnode_limit . . .It Sy zfs_arc_average_blocksize Ns = Ns Sy 8192 Ns B Po 8 KiB Pc Pq uint The ARC's buffer hash table is sized based on the assumption of an average block size of this value. This works out to roughly 1 MiB of hash table per 1 GiB of physical memory with 8-byte pointers. For configurations with a known larger average block size, this value can be increased to reduce the memory footprint. . .It Sy zfs_arc_eviction_pct Ns = Ns Sy 200 Ns % Pq uint When .Fn arc_is_overflowing , .Fn arc_get_data_impl waits for this percent of the requested amount of data to be evicted. For example, by default, for every .Em 2 KiB that's evicted, .Em 1 KiB of it may be "reused" by a new allocation. Since this is above .Sy 100 Ns % , it ensures that progress is made towards getting .Sy arc_size No under Sy arc_c . 
Since this is finite, it ensures that allocations can still happen, even during the potentially long time that .Sy arc_size No is more than Sy arc_c . . .It Sy zfs_arc_evict_batch_limit Ns = Ns Sy 10 Pq uint Number ARC headers to evict per sub-list before proceeding to another sub-list. This batch-style operation prevents entire sub-lists from being evicted at once but comes at a cost of additional unlocking and locking. . .It Sy zfs_arc_evict_threads Ns = Ns Sy 0 Pq int Sets the number of ARC eviction threads to be used. .Pp If set greater than 0, ZFS will dedicate up to that many threads to ARC eviction. Each thread will process one sub-list at a time, until the eviction target is reached or all sub-lists have been processed. When set to 0, ZFS will compute a reasonable number of eviction threads based on the number of CPUs. .TS box; lb l l . CPUs Threads _ 1-4 1 5-8 2 9-15 3 16-31 4 32-63 6 64-95 8 96-127 9 128-160 11 160-191 12 192-223 13 224-255 14 256+ 16 .TE .Pp More threads may improve the responsiveness of ZFS to memory pressure. This can be important for performance when eviction from the ARC becomes a bottleneck for reads and writes. .Pp This parameter can only be set at module load time. . .It Sy zfs_arc_grow_retry Ns = Ns Sy 0 Ns s Pq uint If set to a non zero value, it will replace the .Sy arc_grow_retry value with this value. The .Sy arc_grow_retry .No value Pq default Sy 5 Ns s is the number of seconds the ARC will wait before trying to resume growth after a memory pressure event. . .It Sy zfs_arc_lotsfree_percent Ns = Ns Sy 10 Ns % Pq int Throttle I/O when free system memory drops below this percentage of total system memory. Setting this value to .Sy 0 will disable the throttle. . .It Sy zfs_arc_max Ns = Ns Sy 0 Ns B Pq u64 Max size of ARC in bytes. If .Sy 0 , then the max size of ARC is determined by the amount of system memory installed. The larger of .Sy all_system_memory No \- Sy 1 GiB and .Sy 5/8 No \(mu Sy all_system_memory will be used as the limit. This value must be at least .Sy 67108864 Ns B Pq 64 MiB . .Pp This value can be changed dynamically, with some caveats. It cannot be set back to .Sy 0 while running, and reducing it below the current ARC size will not cause the ARC to shrink without memory pressure to induce shrinking. . .It Sy zfs_arc_meta_balance Ns = Ns Sy 500 Pq uint Balance between metadata and data on ghost hits. Values above 100 increase metadata caching by proportionally reducing effect of ghost data hits on target data/metadata rate. . .It Sy zfs_arc_min Ns = Ns Sy 0 Ns B Pq u64 Min size of ARC in bytes. .No If set to Sy 0 , arc_c_min will default to consuming the larger of .Sy 32 MiB and .Sy all_system_memory No / Sy 32 . . .It Sy zfs_arc_min_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 1s Pc Pq uint Minimum time prefetched blocks are locked in the ARC. . .It Sy zfs_arc_min_prescient_prefetch_ms Ns = Ns Sy 0 Ns ms Ns Po Ns ≡ Ns 6s Pc Pq uint Minimum time "prescient prefetched" blocks are locked in the ARC. These blocks are meant to be prefetched fairly aggressively ahead of the code that may use them. . .It Sy zfs_arc_prune_task_threads Ns = Ns Sy 1 Pq int Number of arc_prune threads. .Fx does not need more than one. Linux may theoretically use one per mount point up to number of CPUs, but that was not proven to be useful. . .It Sy zfs_max_missing_tvds Ns = Ns Sy 0 Pq int Number of missing top-level vdevs which will be allowed during pool import (only in read-only mode). . 
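For .Sy zfs_arc_max above, the default limit on a 16 GiB system works out to max(16 GiB \- 1 GiB, 5/8 \(mu 16 GiB) = 15 GiB. The limit can also be lowered at runtime as described; a minimal sketch capping the ARC at 8 GiB (the size is arbitrary, and the path assumes Linux):
.Bd -literal -offset Ds
echo $((8 * 1024 * 1024 * 1024)) > /sys/module/zfs/parameters/zfs_arc_max
.Ed
Remember that the value must stay at or above 64 MiB and cannot be set back to 0 while the module is loaded.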
.It Sy zfs_max_nvlist_src_size Ns = Sy 0 Pq u64 Maximum size in bytes allowed to be passed as .Sy zc_nvlist_src_size for ioctls on .Pa /dev/zfs . This prevents a user from causing the kernel to allocate an excessive amount of memory. When the limit is exceeded, the ioctl fails with .Sy EINVAL and a description of the error is sent to the .Pa zfs-dbgmsg log. This parameter should not need to be touched under normal circumstances. If .Sy 0 , equivalent to a quarter of the user-wired memory limit under .Fx and to .Sy 134217728 Ns B Pq 128 MiB under Linux. . .It Sy zfs_multilist_num_sublists Ns = Ns Sy 0 Pq uint To allow more fine-grained locking, each ARC state contains a series of lists for both data and metadata objects. Locking is performed at the level of these "sub-lists". This parameters controls the number of sub-lists per ARC state, and also applies to other uses of the multilist data structure. .Pp If .Sy 0 , equivalent to the greater of the number of online CPUs and .Sy 4 . . .It Sy zfs_arc_overflow_shift Ns = Ns Sy 8 Pq int The ARC size is considered to be overflowing if it exceeds the current ARC target size .Pq Sy arc_c by thresholds determined by this parameter. Exceeding by .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No / Sy 2 starts ARC reclamation process. If that appears insufficient, exceeding by .Sy ( arc_c No >> Sy zfs_arc_overflow_shift ) No \(mu Sy 1.5 blocks new buffer allocation until the reclaim thread catches up. Started reclamation process continues till ARC size returns below the target size. .Pp The default value of .Sy 8 causes the ARC to start reclamation if it exceeds the target size by .Em 0.2% of the target size, and block allocations by .Em 0.6% . . .It Sy zfs_arc_shrink_shift Ns = Ns Sy 0 Pq uint If nonzero, this will update .Sy arc_shrink_shift Pq default Sy 7 with the new value. . .It Sy zfs_arc_pc_percent Ns = Ns Sy 0 Ns % Po off Pc Pq uint Percent of pagecache to reclaim ARC to. .Pp This tunable allows the ZFS ARC to play more nicely with the kernel's LRU pagecache. It can guarantee that the ARC size won't collapse under scanning pressure on the pagecache, yet still allows the ARC to be reclaimed down to .Sy zfs_arc_min if necessary. This value is specified as percent of pagecache size (as measured by .Sy NR_ACTIVE_FILE + .Sy NR_INACTIVE_FILE ) , where that percent may exceed .Sy 100 . This only operates during memory pressure/reclaim. . .It Sy zfs_arc_shrinker_limit Ns = Ns Sy 0 Pq int This is a limit on how many pages the ARC shrinker makes available for eviction in response to one page allocation attempt. Note that in practice, the kernel's shrinker can ask us to evict up to about four times this for one allocation attempt. To reduce OOM risk, this limit is applied for kswapd reclaims only. .Pp For example a value of .Sy 10000 Pq in practice, Em 160 MiB No per allocation attempt with 4 KiB pages limits the amount of time spent attempting to reclaim ARC memory to less than 100 ms per allocation attempt, even with a small average compressed block size of ~8 KiB. .Pp The parameter can be set to 0 (zero) to disable the limit, and only applies on Linux. . .It Sy zfs_arc_shrinker_seeks Ns = Ns Sy 2 Pq int Relative cost of ARC eviction on Linux, AKA number of seeks needed to restore evicted page. Bigger values make ARC more precious and evictions smaller, comparing to other kernel subsystems. Value of 4 means parity with page cache. . 
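The 0.2% and 0.6% thresholds quoted for .Sy zfs_arc_overflow_shift above can be reproduced directly; a worked check using an arbitrary 8 GiB target size:
.Bd -literal -offset Ds
arc_c=$((8 * 1024 * 1024 * 1024))
echo $(( (arc_c >> 8) / 2 ))        # 16 MiB over target: ARC reclamation starts
echo $(( (arc_c >> 8) * 3 / 2 ))    # 48 MiB over target: new buffer allocations block
.Ed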
.It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64 The target number of bytes the ARC should leave as free memory on the system. If zero, equivalent to the bigger of .Sy 512 KiB No and Sy all_system_memory/64 . . .It Sy zfs_checksum_events_per_second Ns = Ns Sy 20 Ns /s Pq uint Rate limit checksum events to this many per second. Note that this should not be set below the ZED thresholds (currently 10 checksums over 10 seconds) or else the daemon may not trigger any action. . .It Sy zfs_commit_timeout_pct Ns = Ns Sy 10 Ns % Pq uint This controls the amount of time that a ZIL block (lwb) will remain "open" when it isn't "full", and it has a thread waiting for it to be committed to stable storage. The timeout is scaled based on a percentage of the last lwb latency to avoid significantly impacting the latency of each individual transaction record (itx). . .It Sy zfs_condense_indirect_commit_entry_delay_ms Ns = Ns Sy 0 Ns ms Pq int Vdev indirection layer (used for device removal) sleeps for this many milliseconds during mapping generation. Intended for use with the test suite to throttle vdev removal speed. . .It Sy zfs_condense_indirect_obsolete_pct Ns = Ns Sy 25 Ns % Pq uint Minimum percent of obsolete bytes in vdev mapping required to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . Intended for use with the test suite to facilitate triggering condensing as needed. . .It Sy zfs_condense_indirect_vdevs_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable condensing indirect vdev mappings. When set, attempt to condense indirect vdev mappings if the mapping uses more than .Sy zfs_condense_min_mapping_bytes bytes of memory and if the obsolete space map object uses more than .Sy zfs_condense_max_obsolete_bytes bytes on-disk. The condensing process is an attempt to save memory by removing obsolete mappings. . .It Sy zfs_condense_max_obsolete_bytes Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Only attempt to condense indirect vdev mappings if the on-disk size of the obsolete space map object is greater than this number of bytes .Pq see Sy zfs_condense_indirect_vdevs_enable . . .It Sy zfs_condense_min_mapping_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq u64 Minimum size vdev mapping to attempt to condense .Pq see Sy zfs_condense_indirect_vdevs_enable . . .It Sy zfs_dbgmsg_enable Ns = Ns Sy 1 Ns | Ns 0 Pq int Internally ZFS keeps a small log to facilitate debugging. The log is enabled by default, and can be disabled by unsetting this option. The contents of the log can be accessed by reading .Pa /proc/spl/kstat/zfs/dbgmsg . Writing .Sy 0 to the file clears the log. .Pp This setting does not influence debug prints due to .Sy zfs_flags . . .It Sy zfs_dbgmsg_maxsize Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq uint Maximum size of the internal ZFS debug log. . .It Sy zfs_dbuf_state_index Ns = Ns Sy 0 Pq int Historically used for controlling what reporting was available under .Pa /proc/spl/kstat/zfs . No effect. . .It Sy zfs_deadman_checktime_ms Ns = Ns Sy 60000 Ns ms Po 1 min Pc Pq u64 Check time in milliseconds. This defines the frequency at which we check for hung I/O requests and potentially invoke the .Sy zfs_deadman_failmode behavior. . .It Sy zfs_deadman_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int When a pool sync operation takes longer than .Sy zfs_deadman_synctime_ms , or when an individual I/O operation takes longer than .Sy zfs_deadman_ziotime_ms , then the operation is considered to be "hung". 
If .Sy zfs_deadman_enabled is set, then the deadman behavior is invoked as described by .Sy zfs_deadman_failmode . By default, the deadman is enabled and set to .Sy wait which results in "hung" I/O operations only being logged. The deadman is automatically disabled when a pool gets suspended. . .It Sy zfs_deadman_events_per_second Ns = Ns Sy 1 Ns /s Pq int Rate limit deadman zevents (which report hung I/O operations) to this many per second. . .It Sy zfs_deadman_failmode Ns = Ns Sy wait Pq charp Controls the failure behavior when the deadman detects a "hung" I/O operation. Valid values are: .Bl -tag -compact -offset 4n -width "continue" .It Sy wait Wait for a "hung" operation to complete. For each "hung" operation a "deadman" event will be posted describing that operation. .It Sy continue Attempt to recover from a "hung" operation by re-dispatching it to the I/O pipeline if possible. .It Sy panic Panic the system. This can be used to facilitate automatic fail-over to a properly configured fail-over partner. .El . .It Sy zfs_deadman_synctime_ms Ns = Ns Sy 600000 Ns ms Po 10 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and also the interval after which a pool sync operation is considered to be "hung". Once this limit is exceeded the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the pool sync completes. . .It Sy zfs_deadman_ziotime_ms Ns = Ns Sy 300000 Ns ms Po 5 min Pc Pq u64 Interval in milliseconds after which the deadman is triggered and an individual I/O operation is considered to be "hung". As long as the operation remains "hung", the deadman will be invoked every .Sy zfs_deadman_checktime_ms milliseconds until the operation completes. . .It Sy zfs_dedup_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Enable prefetching dedup-ed blocks which are going to be freed. . .It Sy zfs_dedup_log_flush_min_time_ms Ns = Ns Sy 1000 Ns Pq uint Minimum time to spend on dedup log flush each transaction. .Pp At least this long will be spent flushing dedup log entries each transaction, up to .Sy zfs_txg_timeout . This occurs even if doing so would delay the transaction, that is, other IO completes under this time. . .It Sy zfs_dedup_log_flush_entries_min Ns = Ns Sy 100 Ns Pq uint Flush at least this many entries each transaction. .Pp OpenZFS will flush a fraction of the log every TXG, to keep the size proportional to the ingest rate (see .Sy zfs_dedup_log_flush_txgs ) . This sets the minimum for that estimate, which prevents the backlog from completely draining if the ingest rate falls. Raising it can force OpenZFS to flush more aggressively, reducing the backlog to zero more quickly, but can make it less able to back off if log flushing would compete with other IO too much. . .It Sy zfs_dedup_log_flush_entries_max Ns = Ns Sy UINT_MAX Ns Pq uint Flush at most this many entries each transaction. .Pp Mostly used for debugging purposes. .It Sy zfs_dedup_log_flush_txgs Ns = Ns Sy 100 Ns Pq uint Target number of TXGs to process the whole dedup log. .Pp Every TXG, OpenZFS will process the inverse of this number times the size of the DDT backlog. This will keep the backlog at a size roughly equal to the ingest rate times this value. This offers a balance between a more efficient DDT log, with better aggregation, and shorter import times, which increase as the size of the DDT log increases. Increasing this value will result in a more efficient DDT log, but longer import times. 
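.Pp
The steady state implied by
.Sy zfs_dedup_log_flush_txgs No and Sy zfs_dedup_log_flush_entries_min
can be sketched as follows; the ingest rate is an assumed example figure and
the loop is a deliberate simplification of the real flushing logic:
.Bd -literal -compact
# Sketch only: per-TXG flush work and equilibrium backlog for the dedup log.
flush_txgs  = 100        # default zfs_dedup_log_flush_txgs
entries_min = 100        # default zfs_dedup_log_flush_entries_min
ingest_per_txg = 5000    # assumed: new/changed DDT entries per TXG

backlog = 0
for _ in range(1000):
    backlog += ingest_per_txg
    flushed = max(backlog // flush_txgs, entries_min)
    backlog -= min(flushed, backlog)

# The backlog settles near ingest_per_txg * flush_txgs (~500000 entries
# here), so the whole log is cycled in roughly flush_txgs transaction groups.
print(backlog)
.Ed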
.It Sy zfs_dedup_log_cap Ns = Ns Sy UINT_MAX Ns Pq uint Soft cap for the size of the current dedup log. .Pp If the log is larger than this size, we increase the aggressiveness of the flushing to try to bring it back down to the soft cap. Setting it will reduce import times, but will reduce the efficiency of the DDT log, increasing the expected number of IOs required to flush the same amount of data. .It Sy zfs_dedup_log_hard_cap Ns = Ns Sy 0 Ns | Ns 1 Pq uint Whether to treat the log cap as a firm cap or not. .Pp When set to 0 (the default), the .Sy zfs_dedup_log_cap will increase the maximum number of log entries we flush in a given txg. This will bring the backlog size down towards the cap, but not at the expense of making TXG syncs take longer. If this is set to 1, the cap acts more like a hard cap than a soft cap; it will also increase the minimum number of log entries we flush per TXG. Enabling it will reduce worst-case import times, at the cost of increased TXG sync times. .It Sy zfs_dedup_log_flush_flow_rate_txgs Ns = Ns Sy 10 Ns Pq uint Number of transactions to use to compute the flow rate. .Pp OpenZFS will estimate the number of entries changed (ingest rate), the number of entries flushed (flush rate), and the time spent flushing (flush time rate), and combine these into an overall "flow rate". It will use an exponential weighted moving average over some number of recent transactions to compute these rates. This sets the number of transactions to compute these averages over. Setting it higher can help to smooth out the flow rate in the face of spiky workloads, but the flow rate will take longer to adjust to a sustained change in the ingress rate. . .It Sy zfs_dedup_log_txg_max Ns = Ns Sy 8 Ns Pq uint Maximum number of transactions to accumulate before starting to flush dedup logs. .Pp OpenZFS maintains two dedup logs, one receiving new changes, one flushing. If there is nothing to flush, it will accumulate changes for no more than this many transactions before switching the logs and starting to flush entries out. . .It Sy zfs_dedup_log_mem_max Ns = Ns Sy 0 Ns Pq u64 Max memory to use for dedup logs. .Pp OpenZFS will spend no more than this much memory on maintaining the in-memory dedup log. Flushing will begin when around half this amount is being spent on logs. The default value of .Sy 0 will cause it to be set by .Sy zfs_dedup_log_mem_max_percent instead. . .It Sy zfs_dedup_log_mem_max_percent Ns = Ns Sy 1 Ns % Pq uint Max memory to use for dedup logs, as a percentage of total memory. .Pp If .Sy zfs_dedup_log_mem_max is not set, it will be initialized as a percentage of the total memory in the system. . .It Sy zfs_delay_min_dirty_percent Ns = Ns Sy 60 Ns % Pq uint Start to delay each transaction once there is this amount of dirty data, expressed as a percentage of .Sy zfs_dirty_data_max . This value should be at least .Sy zfs_vdev_async_write_active_max_dirty_percent . .No See Sx ZFS TRANSACTION DELAY . . .It Sy zfs_delay_scale Ns = Ns Sy 500000 Pq int This controls how quickly the transaction delay approaches infinity. Larger values cause longer delays for a given amount of dirty data. .Pp For the smoothest delay, this value should be about 1 billion divided by the maximum number of operations per second. This will smoothly handle between ten times and a tenth of this number. .No See Sx ZFS TRANSACTION DELAY . .Pp .Sy zfs_delay_scale No \(mu Sy zfs_dirty_data_max Em must No be smaller than Sy 2^64 . .
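.Pp
The sizing guidance for
.Sy zfs_delay_scale
above can be checked numerically; the IOPS and dirty-data figures below are
assumed examples, not recommendations:
.Bd -literal -compact
# Sketch only: suggested zfs_delay_scale and the documented overflow bound.
max_iops = 200_000                       # assumed: pool's max write ops/s
suggested_scale = 1_000_000_000 // max_iops
print(suggested_scale)                   # 5000 for this example pool

dirty_data_max = 4 << 30                 # assumed zfs_dirty_data_max: 4 GiB
# The product must stay below 2^64, as required above.
assert suggested_scale * dirty_data_max < 2**64
.Ed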
.It Sy zfs_dio_write_verify_events_per_second Ns = Ns Sy 20 Ns /s Pq uint Rate limit Direct I/O write verify events to this many per second. . .It Sy zfs_disable_ivset_guid_check Ns = Ns Sy 0 Ns | Ns 1 Pq int Disables requirement for IVset GUIDs to be present and match when doing a raw receive of encrypted datasets. Intended for users whose pools were created with OpenZFS pre-release versions and now have compatibility issues. . .It Sy zfs_key_max_salt_uses Ns = Ns Sy 400000000 Po 4*10^8 Pc Pq ulong Maximum number of uses of a single salt value before generating a new one for encrypted datasets. The default value is also the maximum. . .It Sy zfs_object_mutex_size Ns = Ns Sy 64 Pq uint Size of the znode hashtable used for holds. .Pp Due to the need to hold locks on objects that may not exist yet, kernel mutexes are not created per-object and instead a hashtable is used where collisions will result in objects waiting when there is not actually contention on the same object. . .It Sy zfs_slow_io_events_per_second Ns = Ns Sy 20 Ns /s Pq int Rate limit delay zevents (which report slow I/O operations) to this many per second. . .It Sy zfs_unflushed_max_mem_amt Ns = Ns Sy 1073741824 Ns B Po 1 GiB Pc Pq u64 Upper-bound limit for unflushed metadata changes to be held by the log spacemap in memory, in bytes. . .It Sy zfs_unflushed_max_mem_ppm Ns = Ns Sy 1000 Ns ppm Po 0.1% Pc Pq u64 Part of overall system memory that ZFS allows to be used for unflushed metadata changes by the log spacemap, in millionths. . .It Sy zfs_unflushed_log_block_max Ns = Ns Sy 131072 Po 128k Pc Pq u64 Describes the maximum number of log spacemap blocks allowed for each pool. The default value means that the space in all the log spacemaps can add up to no more than .Sy 131072 blocks (which means .Em 16 GiB of logical space before compression and ditto blocks, assuming that blocksize is .Em 128 KiB ) . .Pp This tunable is important because it involves a trade-off between import time after an unclean export and the frequency of flushing metaslabs. The higher this number is, the more log blocks we allow when the pool is active which means that we flush metaslabs less often and thus decrease the number of I/O operations for spacemap updates per TXG. At the same time though, that means that in the event of an unclean export, there will be more log spacemap blocks for us to read, inducing overhead in the import time of the pool. The lower the number, the amount of flushing increases, destroying log blocks quicker as they become obsolete faster, which leaves less blocks to be read during import time after a crash. .Pp Each log spacemap block existing during pool import leads to approximately one extra logical I/O issued. This is the reason why this tunable is exposed in terms of blocks rather than space used. . .It Sy zfs_unflushed_log_block_min Ns = Ns Sy 1000 Pq u64 If the number of metaslabs is small and our incoming rate is high, we could get into a situation that we are flushing all our metaslabs every TXG. Thus we always allow at least this many log blocks. . .It Sy zfs_unflushed_log_block_pct Ns = Ns Sy 400 Ns % Pq u64 Tunable used to determine the number of blocks that can be used for the spacemap log, expressed as a percentage of the total number of unflushed metaslabs in the pool. . .It Sy zfs_unflushed_log_txg_max Ns = Ns Sy 1000 Pq u64 Tunable limiting maximum time in TXGs any metaslab may remain unflushed. 
It effectively limits the maximum number of unflushed per-TXG spacemap logs that need to be read after unclean pool export. . .It Sy zfs_unlink_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint When enabled, files will not be asynchronously removed from the list of pending unlinks and the space they consume will be leaked. Once this option has been disabled and the dataset is remounted, the pending unlinks will be processed and the freed space returned to the pool. This option is used by the test suite. . .It Sy zfs_delete_blocks Ns = Ns Sy 20480 Pq ulong This is used to define a large file for the purposes of deletion. Files containing more than .Sy zfs_delete_blocks will be deleted asynchronously, while smaller files are deleted synchronously. Decreasing this value will reduce the time spent in an .Xr unlink 2 system call, at the expense of a longer delay before the freed space is available. This only applies on Linux. . .It Sy zfs_dirty_data_max Ns = Pq int Determines the dirty space limit in bytes. Once this limit is exceeded, new writes are halted until space frees up. This parameter takes precedence over .Sy zfs_dirty_data_max_percent . .No See Sx ZFS TRANSACTION DELAY . .Pp Defaults to .Sy physical_ram/10 , capped at .Sy zfs_dirty_data_max_max . . .It Sy zfs_dirty_data_max_max Ns = Pq int Maximum allowable value of .Sy zfs_dirty_data_max , expressed in bytes. This limit is only enforced at module load time, and will be ignored if .Sy zfs_dirty_data_max is later changed. This parameter takes precedence over .Sy zfs_dirty_data_max_max_percent . .No See Sx ZFS TRANSACTION DELAY . .Pp Defaults to .Sy min(physical_ram/4, 4GiB) , or .Sy min(physical_ram/4, 1GiB) for 32-bit systems. . .It Sy zfs_dirty_data_max_max_percent Ns = Ns Sy 25 Ns % Pq uint Maximum allowable value of .Sy zfs_dirty_data_max , expressed as a percentage of physical RAM. This limit is only enforced at module load time, and will be ignored if .Sy zfs_dirty_data_max is later changed. The parameter .Sy zfs_dirty_data_max_max takes precedence over this one. .No See Sx ZFS TRANSACTION DELAY . . .It Sy zfs_dirty_data_max_percent Ns = Ns Sy 10 Ns % Pq uint Determines the dirty space limit, expressed as a percentage of all memory. Once this limit is exceeded, new writes are halted until space frees up. The parameter .Sy zfs_dirty_data_max takes precedence over this one. .No See Sx ZFS TRANSACTION DELAY . .Pp Subject to .Sy zfs_dirty_data_max_max . . .It Sy zfs_dirty_data_sync_percent Ns = Ns Sy 20 Ns % Pq uint Start syncing out a transaction group if there's at least this much dirty data .Pq as a percentage of Sy zfs_dirty_data_max . This should be less than .Sy zfs_vdev_async_write_active_min_dirty_percent . . .It Sy zfs_wrlog_data_max Ns = Pq int The upper limit of write-transaction ZIL log data size in bytes. Write operations are throttled when approaching the limit until log data is cleared out after transaction group sync. Because of some overhead, it should be set at least 2 times the size of .Sy zfs_dirty_data_max .No to prevent harming normal write throughput . It also should be smaller than the size of the slog device if slog is present. .Pp Defaults to .Sy zfs_dirty_data_max*2 . .It Sy zfs_fallocate_reserve_percent Ns = Ns Sy 110 Ns % Pq uint Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be preallocated for a file in order to guarantee that later writes will not run out of space.
Instead, .Xr fallocate 2 space preallocation only checks that sufficient space is currently available in the pool or the user's project quota allocation, and then creates a sparse file of the requested size. The requested space is multiplied by .Sy zfs_fallocate_reserve_percent to allow additional space for indirect blocks and other internal metadata. Setting this to .Sy 0 disables support for .Xr fallocate 2 and causes it to return .Sy EOPNOTSUPP . . .It Sy zfs_fletcher_4_impl Ns = Ns Sy fastest Pq string Select a fletcher 4 implementation. .Pp Supported selectors are: .Sy fastest , scalar , sse2 , ssse3 , avx2 , avx512f , avx512bw , .No and Sy aarch64_neon . All except .Sy fastest No and Sy scalar require instruction set extensions to be available, and will only appear if ZFS detects that they are present at runtime. If multiple implementations of fletcher 4 are available, the .Sy fastest will be chosen using a micro benchmark. Selecting .Sy scalar results in the original CPU-based calculation being used. Selecting any option other than .Sy fastest No or Sy scalar results in vector instructions from the respective CPU instruction set being used. . .It Sy zfs_bclone_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enables access to the block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, using functions and system calls that attempt to clone blocks will act as though the feature is disabled. . .It Sy zfs_bclone_wait_dirty Ns = Ns Sy 1 Ns | Ns 0 Pq int When set to 1 the FICLONE and FICLONERANGE ioctls will wait for any dirty data to be written to disk before proceeding. This ensures that the clone operation reliably succeeds, even if a file is modified and then immediately cloned. Note that for small files this may be slower than simply copying the file. When set to 0 the clone operation will immediately fail if it encounters any dirty blocks. By default waiting is enabled. . .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp Supported selectors are: .Sy cycle , fastest , generic , sse2 , sse41 , avx2 , avx512 . All except .Sy cycle , fastest No and Sy generic require instruction set extensions to be available, and will only appear if ZFS detects that they are present at runtime. If multiple implementations of BLAKE3 are available, the .Sy fastest will be chosen using a micro benchmark. You can see the benchmark results by reading this kstat file: .Pa /proc/spl/kstat/zfs/chksum_bench . . .It Sy zfs_free_bpobj_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable/disable the processing of the free_bpobj object. . .It Sy zfs_async_block_max_blocks Ns = Ns Sy UINT64_MAX Po unlimited Pc Pq u64 Maximum number of blocks freed in a single TXG. . .It Sy zfs_max_async_dedup_frees Ns = Ns Sy 100000 Po 10^5 Pc Pq u64 Maximum number of dedup blocks freed in a single TXG. . .It Sy zfs_vdev_async_read_max_active Ns = Ns Sy 3 Pq uint Maximum asynchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_read_min_active Ns = Ns Sy 1 Pq uint Minimum asynchronous read I/O operation active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_active_max_dirty_percent Ns = Ns Sy 60 Ns % Pq uint When the pool has more than this much dirty data, use .Sy zfs_vdev_async_write_max_active to limit active async writes. If the dirty data is between the minimum and maximum, the active I/O limit is linearly interpolated. .No See Sx ZFS I/O SCHEDULER . . 
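.Pp
Putting the dirty-data entries above together with the asynchronous-write
scaling just described, the following Python sketch computes a default
.Sy zfs_dirty_data_max
and the linearly interpolated number of active async writes for a few dirty
levels; the RAM size and the rounding are assumptions of the sketch, not the
exact kernel code:
.Bd -literal -compact
# Sketch only: default dirty-data limit and async-write scaling.
GiB = 1 << 30
ram = 64 * GiB                                  # assumed physical memory

# Documented defaults: physical_ram/10, capped at min(physical_ram/4, 4 GiB).
dirty_data_max = min(ram // 10, min(ram // 4, 4 * GiB))

min_active, max_active = 2, 10                  # async write min/max active
lo_pct, hi_pct = 30, 60                         # min/max dirty percent

def async_write_active(dirty_bytes):
    pct = 100 * dirty_bytes / dirty_data_max
    if pct <= lo_pct:
        return min_active
    if pct >= hi_pct:
        return max_active
    frac = (pct - lo_pct) / (hi_pct - lo_pct)
    return round(min_active + frac * (max_active - min_active))

print(dirty_data_max // GiB)                    # 4 (capped at 4 GiB)
for frac_dirty in (0.25, 0.45, 0.75):
    print(async_write_active(int(frac_dirty * dirty_data_max)))
# prints 2 (below 30%), 6 (interpolated), 10 (above 60%)
.Ed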
.It Sy zfs_vdev_async_write_active_min_dirty_percent Ns = Ns Sy 30 Ns % Pq uint When the pool has less than this much dirty data, use .Sy zfs_vdev_async_write_min_active to limit active async writes. If the dirty data is between the minimum and maximum, the active I/O limit is linearly interpolated. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_max_active Ns = Ns Sy 10 Pq uint Maximum asynchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_async_write_min_active Ns = Ns Sy 2 Pq uint Minimum asynchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . .Pp Lower values are associated with better latency on rotational media but poorer resilver performance. The default value of .Sy 2 was chosen as a compromise. A value of .Sy 3 has been shown to improve resilver performance further at a cost of further increasing latency. . .It Sy zfs_vdev_initializing_max_active Ns = Ns Sy 1 Pq uint Maximum initializing I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_initializing_min_active Ns = Ns Sy 1 Pq uint Minimum initializing I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_max_active Ns = Ns Sy 1000 Pq uint The maximum number of I/O operations active to each device. Ideally, this will be at least the sum of each queue's .Sy max_active . .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_open_timeout_ms Ns = Ns Sy 1000 Pq uint Timeout value to wait before determining a device is missing during import. This is helpful for transient missing paths due to links being briefly removed and recreated in response to udev events. . .It Sy zfs_vdev_rebuild_max_active Ns = Ns Sy 3 Pq uint Maximum sequential resilver I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_rebuild_min_active Ns = Ns Sy 1 Pq uint Minimum sequential resilver I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_removal_max_active Ns = Ns Sy 2 Pq uint Maximum removal I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_removal_min_active Ns = Ns Sy 1 Pq uint Minimum removal I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_scrub_max_active Ns = Ns Sy 2 Pq uint Maximum scrub I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_scrub_min_active Ns = Ns Sy 1 Pq uint Minimum scrub I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_read_max_active Ns = Ns Sy 10 Pq uint Maximum synchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_read_min_active Ns = Ns Sy 10 Pq uint Minimum synchronous read I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_write_max_active Ns = Ns Sy 10 Pq uint Maximum synchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_sync_write_min_active Ns = Ns Sy 10 Pq uint Minimum synchronous write I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_trim_max_active Ns = Ns Sy 2 Pq uint Maximum trim/discard I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_trim_min_active Ns = Ns Sy 1 Pq uint Minimum trim/discard I/O operations active to each device. .No See Sx ZFS I/O SCHEDULER . . 
.It Sy zfs_vdev_nia_delay Ns = Ns Sy 5 Pq uint For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), the number of concurrently-active I/O operations is limited to .Sy zfs_*_min_active , unless the vdev is "idle". When there are no interactive I/O operations active (synchronous or otherwise), and .Sy zfs_vdev_nia_delay operations have completed since the last interactive operation, then the vdev is considered to be "idle", and the number of concurrently-active non-interactive operations is increased to .Sy zfs_*_max_active . .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_nia_credit Ns = Ns Sy 5 Pq uint Some HDDs tend to prioritize sequential I/O so strongly, that concurrent random I/O latency reaches several seconds. On some HDDs this happens even if sequential I/O operations are submitted one at a time, and so setting .Sy zfs_*_max_active Ns = Sy 1 does not help. To prevent non-interactive I/O, like scrub, from monopolizing the device, no more than .Sy zfs_vdev_nia_credit operations can be sent while there are outstanding incomplete interactive operations. This enforced wait ensures the HDD services the interactive I/O within a reasonable amount of time. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint Defines if the driver should retire on a given error type. The following options may be bitwise-ored together: .TS box; lbz r l l . Value Name Description _ 1 Device No driver retries on device errors 2 Transport No driver retries on transport errors. 4 Driver No driver retries on driver errors. .TE . .It Sy zfs_vdev_disk_max_segs Ns = Ns Sy 0 Pq uint Maximum number of segments to add to a BIO (min 4). If this is higher than the maximum allowed by the device queue or the kernel itself, it will be clamped. Setting it to zero will cause the kernel's ideal size to be used. This parameter only applies on Linux. . .It Sy zfs_expire_snapshot Ns = Ns Sy 300 Ns s Pq int Time before expiring .Pa .zfs/snapshot . . .It Sy zfs_admin_snapshot Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow the creation, removal, or renaming of entries in the .Sy .zfs/snapshot directory to cause the creation, destruction, or renaming of snapshots. When enabled, this functionality works both locally and over NFS exports which have the .Em no_root_squash option set. . .It Sy zfs_snapshot_no_setuid Ns = Ns Sy 0 Ns | Ns 1 Pq int Whether to disable .Em setuid/setgid support for snapshot mounts triggered by access to the .Sy .zfs/snapshot directory by setting the .Em nosuid mount option. . .It Sy zfs_flags Ns = Ns Sy 0 Pq int Set additional debugging flags. The following flags may be bitwise-ored together: .TS box; lbz r l l . Value Name Description _ 1 ZFS_DEBUG_DPRINTF Enable dprintf entries in the debug log. * 2 ZFS_DEBUG_DBUF_VERIFY Enable extra dbuf verifications. * 4 ZFS_DEBUG_DNODE_VERIFY Enable extra dnode verifications. 8 ZFS_DEBUG_SNAPNAMES Enable snapshot name verification. * 16 ZFS_DEBUG_MODIFY Check for illegally modified ARC buffers. 64 ZFS_DEBUG_ZIO_FREE Enable verification of block frees. 128 ZFS_DEBUG_HISTOGRAM_VERIFY Enable extra spacemap histogram verifications. 256 ZFS_DEBUG_METASLAB_VERIFY Verify space accounting on disk matches in-memory \fBrange_trees\fP. 512 ZFS_DEBUG_SET_ERROR Enable \fBSET_ERROR\fP and dprintf entries in the debug log. 1024 ZFS_DEBUG_INDIRECT_REMAP Verify split blocks created by device removal. 2048 ZFS_DEBUG_TRIM Verify TRIM ranges are always within the allocatable range tree. 
4096 ZFS_DEBUG_LOG_SPACEMAP Verify that the log summary is consistent with the spacemap log and enable \fBzfs_dbgmsgs\fP for metaslab loading and flushing. 8192 ZFS_DEBUG_METASLAB_ALLOC Enable debugging messages when allocations fail. 16384 ZFS_DEBUG_BRT Enable BRT-related debugging messages. 32768 ZFS_DEBUG_RAIDZ_RECONSTRUCT Enabled debugging messages for raidz reconstruction. 65536 ZFS_DEBUG_DDT Enable DDT-related debugging messages. .TE .Sy \& * No Requires debug build . . .It Sy zfs_btree_verify_intensity Ns = Ns Sy 0 Pq uint Enables btree verification. The following settings are cumulative: .TS box; lbz r l l . Value Description 1 Verify height. 2 Verify pointers from children to parent. 3 Verify element counts. 4 Verify element order. (expensive) * 5 Verify unused memory is poisoned. (expensive) .TE .Sy \& * No Requires debug build . . .It Sy zfs_free_leak_on_eio Ns = Ns Sy 0 Ns | Ns 1 Pq int If destroy encounters an .Sy EIO while reading metadata (e.g. indirect blocks), space referenced by the missing metadata can not be freed. Normally this causes the background destroy to become "stalled", as it is unable to make forward progress. While in this stalled state, all remaining space to free from the error-encountering filesystem is "temporarily leaked". Set this flag to cause it to ignore the .Sy EIO , permanently leak the space from indirect blocks that can not be read, and continue to free everything else that it can. .Pp The default "stalling" behavior is useful if the storage partially fails (i.e. some but not all I/O operations fail), and then later recovers. In this case, we will be able to continue pool operations while it is partially failed, and when it recovers, we can continue to free the space, with no leaks. Note, however, that this case is actually fairly rare. .Pp Typically pools either .Bl -enum -compact -offset 4n -width "1." .It fail completely (but perhaps temporarily, e.g. due to a top-level vdev going offline), or .It have localized, permanent errors (e.g. disk returns the wrong data due to bit flip or firmware bug). .El In the former case, this setting does not matter because the pool will be suspended and the sync thread will not be able to make forward progress regardless. In the latter, because the error is permanent, the best we can do is leak the minimum amount of space, which is what setting this flag will do. It is therefore reasonable for this flag to normally be set, but we chose the more conservative approach of not setting it, so that there is no possibility of leaking space in the "partial temporary" failure case. . .It Sy zfs_free_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1s Pc Pq uint During a .Nm zfs Cm destroy operation using the .Sy async_destroy feature, a minimum of this much time will be spent working on freeing blocks per TXG. . .It Sy zfs_obsolete_min_time_ms Ns = Ns Sy 500 Ns ms Pq uint Similar to .Sy zfs_free_min_time_ms , but for cleanup of old indirection records for removed vdevs. . .It Sy zfs_immediate_write_sz Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq s64 Largest write size to store the data directly into the ZIL if .Sy logbias Ns = Ns Sy latency . Larger writes may be written indirectly similar to .Sy logbias Ns = Ns Sy throughput . In presence of SLOG this parameter is ignored, as if it was set to infinity, storing all written data into ZIL to not depend on regular vdev latency. . .It Sy zil_special_is_slog Ns = Ns Sy 1 Ns | Ns 0 Pq int When enabled, and written blocks go to normal vdevs, treat present special vdevs as SLOGs. 
Blocks that go to the special vdevs are still written indirectly, as with .Sy logbias Ns = Ns Sy throughput . This parameter is ignored if an SLOG is present. . .It Sy zfs_initialize_value Ns = Ns Sy 16045690984833335022 Po 0xDEADBEEFDEADBEEE Pc Pq u64 Pattern written to vdev free space by .Xr zpool-initialize 8 . . .It Sy zfs_initialize_chunk_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Size of writes used by .Xr zpool-initialize 8 . This option is used by the test suite. . .It Sy zfs_livelist_max_entries Ns = Ns Sy 500000 Po 5*10^5 Pc Pq u64 The threshold size (in block pointers) at which we create a new sub-livelist. Larger sublists are more costly from a memory perspective but the fewer sublists there are, the lower the cost of insertion. . .It Sy zfs_livelist_min_percent_shared Ns = Ns Sy 75 Ns % Pq int If the amount of shared space between a snapshot and its clone drops below this threshold, the clone turns off the livelist and reverts to the old deletion method. This is in place because livelists no longer give us a benefit once a clone has been overwritten enough. . .It Sy zfs_livelist_condense_new_alloc Ns = Ns Sy 0 Pq int Incremented each time an extra ALLOC blkptr is added to a livelist entry while it is being condensed. This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_cancel Ns = Ns Sy 0 Pq int Incremented each time livelist condensing is canceled while in .Fn spa_livelist_condense_sync . This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_sync_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before executing the synctask \(em .Fn spa_livelist_condense_sync . This option is used by the test suite to trigger race conditions. . .It Sy zfs_livelist_condense_zthr_cancel Ns = Ns Sy 0 Pq int Incremented each time livelist condensing is canceled while in .Fn spa_livelist_condense_cb . This option is used by the test suite to track race conditions. . .It Sy zfs_livelist_condense_zthr_pause Ns = Ns Sy 0 Ns | Ns 1 Pq int When set, the livelist condense process pauses indefinitely before executing the open context condensing work in .Fn spa_livelist_condense_cb . This option is used by the test suite to trigger race conditions. . .It Sy zfs_lua_max_instrlimit Ns = Ns Sy 100000000 Po 10^8 Pc Pq u64 The maximum execution time limit that can be set for a ZFS channel program, specified as a number of Lua instructions. . .It Sy zfs_lua_max_memlimit Ns = Ns Sy 104857600 Po 100 MiB Pc Pq u64 The maximum memory limit that can be set for a ZFS channel program, specified in bytes. . .It Sy zfs_max_dataset_nesting Ns = Ns Sy 50 Pq int The maximum depth of nested datasets. This value can be tuned temporarily to fix existing datasets that exceed the predefined limit. . .It Sy zfs_max_log_walking Ns = Ns Sy 5 Pq u64 The number of past TXGs that the flushing algorithm of the log spacemap feature uses to estimate incoming log blocks. . .It Sy zfs_max_logsm_summary_length Ns = Ns Sy 10 Pq u64 Maximum number of rows allowed in the summary of the spacemap log. . .It Sy zfs_max_recordsize Ns = Ns Sy 16777216 Po 16 MiB Pc Pq uint We currently support block sizes from .Em 512 Po 512 B Pc No to Em 16777216 Po 16 MiB Pc . The benefits of larger blocks, and thus larger I/O, need to be weighed against the cost of COWing a giant block to modify one byte. Additionally, very large blocks can have an impact on I/O latency, and also potentially on the memory allocator.
Therefore, we formerly forbade creating blocks larger than 1M. Larger blocks could be created by changing it, and pools with larger blocks can always be imported and used, regardless of this setting. .Pp Note that it is still limited by default to .Ar 1 MiB on x86_32, because Linux's 3/1 memory split doesn't leave much room for 16M chunks. . .It Sy zfs_allow_redacted_dataset_mount Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow datasets received with redacted send/receive to be mounted. Normally disabled because these datasets may be missing key data. . .It Sy zfs_min_metaslabs_to_flush Ns = Ns Sy 1 Pq u64 Minimum number of metaslabs to flush per dirty TXG. . .It Sy zfs_metaslab_fragmentation_threshold Ns = Ns Sy 77 Ns % Pq uint Allow metaslabs to keep their active state as long as their fragmentation percentage is no more than this value. An active metaslab that exceeds this threshold will no longer keep its active status allowing better metaslabs to be selected. . .It Sy zfs_mg_fragmentation_threshold Ns = Ns Sy 95 Ns % Pq uint Metaslab groups are considered eligible for allocations if their fragmentation metric (measured as a percentage) is less than or equal to this value. If a metaslab group exceeds this threshold then it will be skipped unless all metaslab groups within the metaslab class have also crossed this threshold. . .It Sy zfs_mg_noalloc_threshold Ns = Ns Sy 0 Ns % Pq uint Defines a threshold at which metaslab groups should be eligible for allocations. The value is expressed as a percentage of free space beyond which a metaslab group is always eligible for allocations. If a metaslab group's free space is less than or equal to the threshold, the allocator will avoid allocating to that group unless all groups in the pool have reached the threshold. Once all groups have reached the threshold, all groups are allowed to accept allocations. The default value of .Sy 0 disables the feature and causes all metaslab groups to be eligible for allocations. .Pp This parameter allows one to deal with pools having heavily imbalanced vdevs such as would be the case when a new vdev has been added. Setting the threshold to a non-zero percentage will stop allocations from being made to vdevs that aren't filled to the specified percentage and allow lesser filled vdevs to acquire more allocations than they otherwise would under the old .Sy zfs_mg_alloc_failures facility. . .It Sy zfs_ddt_data_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int If enabled, ZFS will place DDT data into the special allocation class. . .It Sy zfs_user_indirect_is_special Ns = Ns Sy 1 Ns | Ns 0 Pq int If enabled, ZFS will place user data indirect blocks into the special allocation class. . .It Sy zfs_multihost_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest multihost updates will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /multihost . . .It Sy zfs_multihost_interval Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq u64 Used to control the frequency of multihost writes which are performed when the .Sy multihost pool property is on. This is one of the factors used to determine the length of the activity check during import. .Pp The multihost write period is .Sy zfs_multihost_interval No / Sy leaf-vdevs . On average a multihost write will be issued for each leaf vdev every .Sy zfs_multihost_interval milliseconds. In practice, the observed period can vary with the I/O load and this observed value is the delay which is stored in the uberblock. . 
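.Pp
The write-period relationship described for
.Sy zfs_multihost_interval
is simple to check; the vdev count below is an assumed example:
.Bd -literal -compact
# Sketch only: expected multihost write cadence per leaf vdev.
zfs_multihost_interval_ms = 1000     # default (1 s)
leaf_vdevs = 8                       # assumed example pool width

write_period_ms = zfs_multihost_interval_ms / leaf_vdevs
# Overall, an MMP write goes out every 125 ms for this example pool, while
# each individual leaf vdev still sees one write per interval on average.
print(write_period_ms)
.Ed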
.It Sy zfs_multihost_import_intervals Ns = Ns Sy 20 Pq uint Used to control the duration of the activity test on import. Smaller values of .Sy zfs_multihost_import_intervals will reduce the import time but increase the risk of failing to detect an active pool. The total activity check time is never allowed to drop below one second. .Pp On import the activity check waits a minimum amount of time determined by .Sy zfs_multihost_interval No \(mu Sy zfs_multihost_import_intervals , or the same product computed on the host which last had the pool imported, whichever is greater. The activity check time may be further extended if the value of MMP delay found in the best uberblock indicates actual multihost updates happened at longer intervals than .Sy zfs_multihost_interval . A minimum of .Em 100 ms is enforced. .Pp .Sy 0 No is equivalent to Sy 1 . . .It Sy zfs_multihost_fail_intervals Ns = Ns Sy 10 Pq uint Controls the behavior of the pool when multihost write failures or delays are detected. .Pp When .Sy 0 , multihost write failures or delays are ignored. The failures will still be reported to the ZED which depending on its configuration may take action such as suspending the pool or offlining a device. .Pp Otherwise, the pool will be suspended if .Sy zfs_multihost_fail_intervals No \(mu Sy zfs_multihost_interval milliseconds pass without a successful MMP write. This guarantees the activity test will see MMP writes if the pool is imported. .Sy 1 No is equivalent to Sy 2 ; this is necessary to prevent the pool from being suspended due to normal, small I/O latency variations. . .It Sy zfs_no_scrub_io Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to disable scrub I/O. This results in scrubs not actually scrubbing data and simply doing a metadata crawl of the pool instead. . .It Sy zfs_no_scrub_prefetch Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to disable block prefetching for scrubs. . .It Sy zfs_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable cache flush operations on disks when writing. Setting this will cause pool corruption on power loss if a volatile out-of-order write cache is enabled. . .It Sy zfs_nopwrite_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Allow no-operation writes. The occurrence of nopwrites will further depend on other pool properties .Pq i.a. the checksumming and compression algorithms . . .It Sy zfs_dmu_offset_next_sync Ns = Ns Sy 1 Ns | Ns 0 Pq int Enable forcing TXG sync to find holes. When enabled forces ZFS to sync data when .Sy SEEK_HOLE No or Sy SEEK_DATA flags are used allowing holes in a file to be accurately reported. When disabled holes will not be reported in recently dirtied files. . .It Sy zfs_pd_bytes_max Ns = Ns Sy 52428800 Ns B Po 50 MiB Pc Pq int The number of bytes which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. . .It Sy zfs_traverse_indirect_prefetch_limit Ns = Ns Sy 32 Pq uint The number of blocks pointed by indirect (non-L0) block which should be prefetched during a pool traversal, like .Nm zfs Cm send or other data crawling operations. . .It Sy zfs_per_txg_dirty_frees_percent Ns = Ns Sy 30 Ns % Pq u64 Control percentage of dirtied indirect blocks from frees allowed into one TXG. After this threshold is crossed, additional frees will wait until the next TXG. .Sy 0 No disables this throttle . . .It Sy zfs_prefetch_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable predictive prefetch. Note that it leaves "prescient" prefetch .Pq for, e.g., Nm zfs Cm send intact. 
Unlike predictive prefetch, prescient prefetch never issues I/O that ends up not being needed, so it can't hurt performance. . .It Sy zfs_qat_checksum_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for SHA256 checksums. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_qat_compress_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for gzip compression. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_qat_encrypt_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable QAT hardware acceleration for AES-GCM encryption. May be unset after the ZFS modules have been loaded to initialize the QAT hardware as long as support is compiled in and the QAT driver is present. . .It Sy zfs_vnops_read_chunk_size Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 Bytes to read per chunk. . .It Sy zfs_read_history Ns = Ns Sy 0 Pq uint Historical statistics for this many latest reads will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /reads . . .It Sy zfs_read_history_hits Ns = Ns Sy 0 Ns | Ns 1 Pq int Include cache hits in read history . .It Sy zfs_rebuild_max_segment Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq u64 Maximum read segment size to issue when sequentially resilvering a top-level vdev. . .It Sy zfs_rebuild_scrub_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Automatically start a pool scrub when the last active sequential resilver completes in order to verify the checksums of all blocks which have been resilvered. This is enabled by default and strongly recommended. . .It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Maximum amount of I/O that can be concurrently issued for a sequential resilver per leaf device, given in bytes. . .It Sy zfs_reconstruct_indirect_combinations_max Ns = Ns Sy 4096 Pq int If an indirect split block contains more than this many possible unique combinations when being reconstructed, consider it too computationally expensive to check them all. Instead, try at most this many randomly selected combinations each time the block is accessed. This allows all segment copies to participate fairly in the reconstruction when all combinations cannot be checked and prevents repeated use of one bad copy. . .It Sy zfs_recover Ns = Ns Sy 0 Ns | Ns 1 Pq int Set to attempt to recover from fatal errors. This should only be used as a last resort, as it typically results in leaked space, or worse. . .It Sy zfs_removal_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int Ignore hard I/O errors during device removal. When set, if a device encounters a hard I/O error during the removal process the removal will not be canceled. This can result in a normally recoverable block becoming permanently damaged and is hence not recommended. This should only be used as a last resort when the pool cannot be returned to a healthy state prior to removing the device. . .It Sy zfs_removal_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq uint This is used by the test suite so that it can ensure that certain actions happen while in the middle of a removal. . .It Sy zfs_remove_max_segment Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The largest contiguous segment that we will attempt to allocate when removing a device. If there is a performance problem with attempting to allocate large blocks, consider decreasing this. 
The default value is also the maximum. . .It Sy zfs_resilver_disable_defer Ns = Ns Sy 0 Ns | Ns 1 Pq int Ignore the .Sy resilver_defer feature, causing an operation that would start a resilver to immediately restart the one in progress. . .It Sy zfs_resilver_defer_percent Ns = Ns Sy 10 Ns % Pq uint If the ongoing resilver progress is below this threshold, a new resilver will restart from scratch instead of being deferred after the current one finishes, even if the .Sy resilver_defer feature is enabled. . .It Sy zfs_resilver_min_time_ms Ns = Ns Sy 3000 Ns ms Po 3 s Pc Pq uint Resilvers are processed by the sync thread. While resilvering, it will spend at least this much time working on a resilver between TXG flushes. . .It Sy zfs_scan_ignore_errors Ns = Ns Sy 0 Ns | Ns 1 Pq int If set, remove the DTL (dirty time list) upon completion of a pool scan (scrub), even if there were unrepairable errors. Intended to be used during pool repair or recovery to stop resilvering when the pool is next imported. . .It Sy zfs_scrub_after_expand Ns = Ns Sy 1 Ns | Ns 0 Pq int Automatically start a pool scrub after a RAIDZ expansion completes in order to verify the checksums of all blocks which have been copied during the expansion. This is enabled by default and strongly recommended. . .It Sy zfs_scrub_min_time_ms Ns = Ns Sy 1000 Ns ms Po 1 s Pc Pq uint Scrubs are processed by the sync thread. While scrubbing, it will spend at least this much time working on a scrub between TXG flushes. . .It Sy zfs_scrub_error_blocks_per_txg Ns = Ns Sy 4096 Pq uint Error blocks to be scrubbed in one txg. . .It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq uint To preserve progress across reboots, the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verification I/O to disk. The frequency of this flushing is determined by this tunable. . .It Sy zfs_scan_fill_weight Ns = Ns Sy 3 Pq uint This tunable affects how scrub and resilver I/O segments are ordered. A higher number indicates that we care more about how filled in a segment is, while a lower number indicates we care more about the size of the extent without considering the gaps within a segment. This value is only tunable upon module insertion. Changing the value afterwards will have no effect on scrub or resilver performance. . .It Sy zfs_scan_issue_strategy Ns = Ns Sy 0 Pq uint Determines the order that data will be verified while scrubbing or resilvering: .Bl -tag -compact -offset 4n -width "a" .It Sy 1 Data will be verified as sequentially as possible, given the amount of memory reserved for scrubbing .Pq see Sy zfs_scan_mem_lim_fact . This may improve scrub performance if the pool's data is very fragmented. .It Sy 2 The largest mostly-contiguous chunk of found data will be verified first. By deferring scrubbing of small segments, we may later find adjacent data to coalesce and increase the segment size. .It Sy 0 .No Use strategy Sy 1 No during normal verification .No and strategy Sy 2 No while taking a checkpoint . .El . .It Sy zfs_scan_legacy Ns = Ns Sy 0 Ns | Ns 1 Pq int If unset, indicates that scrubs and resilvers will gather metadata in memory before issuing sequential I/O. Otherwise indicates that the legacy algorithm will be used, where I/O is initiated as soon as it is discovered. Unsetting will not affect scrubs or resilvers that are already in progress. . 
.It Sy zfs_scan_max_ext_gap Ns = Ns Sy 2097152 Ns B Po 2 MiB Pc Pq int Sets the largest gap in bytes between scrub/resilver I/O operations that will still be considered sequential for sorting purposes. Changing this value will not affect scrubs or resilvers that are already in progress. . .It Sy zfs_scan_mem_lim_fact Ns = Ns Sy 20 Ns ^-1 Pq uint Maximum fraction of RAM used for I/O sorting by the sequential scan algorithm. This tunable determines the hard limit for I/O sorting memory usage. When the hard limit is reached we stop scanning metadata and start issuing data verification I/O. This is done until we get below the soft limit. . .It Sy zfs_scan_mem_lim_soft_fact Ns = Ns Sy 20 Ns ^-1 Pq uint The fraction of the hard limit used to determine the soft limit for I/O sorting by the sequential scan algorithm. When we cross this limit from below no action is taken. When we cross this limit from above it is because we are issuing verification I/O. In this case (unless the metadata scan is done) we stop issuing verification I/O and start scanning metadata again until we get to the hard limit. . .It Sy zfs_scan_report_txgs Ns = Ns Sy 0 Ns | Ns 1 Pq uint When reporting resilver throughput and estimated completion time, use the performance observed over roughly the last .Sy zfs_scan_report_txgs TXGs. When set to zero, performance is calculated over the time between checkpoints. . .It Sy zfs_scan_strict_mem_lim Ns = Ns Sy 0 Ns | Ns 1 Pq int Enforce tight memory limits on pool scans when a sequential scan is in progress. When disabled, the memory limit may be exceeded by fast disks. . .It Sy zfs_scan_suspend_progress Ns = Ns Sy 0 Ns | Ns 1 Pq int Freezes a scrub/resilver in progress without actually pausing it. Intended for testing/debugging. . .It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum amount of data that can be concurrently issued at once for scrubs and resilvers per leaf device, given in bytes. . .It Sy zfs_send_corrupt_data Ns = Ns Sy 0 Ns | Ns 1 Pq int Allow sending of corrupt data (ignore read/checksum errors when sending). . .It Sy zfs_send_unmodified_spill_blocks Ns = Ns Sy 1 Ns | Ns 0 Pq int Include unmodified spill blocks in the send stream. Under certain circumstances, previous versions of ZFS could incorrectly remove the spill block from an existing object. Including unmodified copies of the spill blocks creates a backwards-compatible stream which will recreate a spill block if it was incorrectly removed. . .It Sy zfs_send_no_prefetch_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm send internal queues. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_send_no_prefetch_queue_length Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint The maximum number of bytes allowed in .Nm zfs Cm send Ns 's internal queues. . .It Sy zfs_send_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm send prefetch queue. The fill fraction controls the timing with which internal threads are woken up. . .It Sy zfs_send_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The maximum number of bytes allowed that will be prefetched by .Nm zfs Cm send . This value must be at least twice the maximum block size in use. . .It Sy zfs_recv_queue_ff Ns = Ns Sy 20 Ns ^\-1 Pq uint The fill fraction of the .Nm zfs Cm receive queue. The fill fraction controls the timing with which internal threads are woken up. .
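.Pp
The fractional values above (written as 20^-1) act as divisors.
A short Python sketch of the resulting scan memory limits follows; the RAM
size is an assumed example:
.Bd -literal -compact
# Sketch only: hard and soft memory limits for sorted scrub/resilver I/O.
ram = 64 << 30                        # assumed: 64 GiB of physical memory
zfs_scan_mem_lim_fact = 20            # hard limit divisor (default)
zfs_scan_mem_lim_soft_fact = 20       # soft limit divisor of the hard limit

hard_limit = ram // zfs_scan_mem_lim_fact
soft_limit = hard_limit // zfs_scan_mem_lim_soft_fact
print(hard_limit >> 30)               # 3 GiB (64 GiB / 20, truncated)
print(soft_limit >> 20)               # ~163 MiB
.Ed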
.It Sy zfs_recv_queue_length Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq uint The maximum number of bytes allowed in the .Nm zfs Cm receive queue. This value must be at least twice the maximum block size in use. . .It Sy zfs_recv_write_batch_size Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint The maximum amount of data, in bytes, that .Nm zfs Cm receive will write in one DMU transaction. This is the uncompressed size, even when receiving a compressed send stream. This setting will not reduce the write size below a single block. Capped at a maximum of .Sy 32 MiB . . .It Sy zfs_recv_best_effort_corrective Ns = Ns Sy 0 Pq int When this variable is set to non-zero a corrective receive: .Bl -enum -compact -offset 4n -width "1." .It Does not enforce the restriction of source & destination snapshot GUIDs matching. .It If there is an error during healing, the healing receive is not terminated instead it moves on to the next record. .El . .It Sy zfs_override_estimate_recordsize Ns = Ns Sy 0 Ns | Ns 1 Pq uint Setting this variable overrides the default logic for estimating block sizes when doing a .Nm zfs Cm send . The default heuristic is that the average block size will be the current recordsize. Override this value if most data in your dataset is not of that size and you require accurate zfs send size estimates. . .It Sy zfs_sync_pass_deferred_free Ns = Ns Sy 2 Pq uint Flushing of data to disk is done in passes. Defer frees starting in this pass. . .It Sy zfs_spa_discard_memory_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum memory used for prefetching a checkpoint's space map on each vdev while discarding the checkpoint. . .It Sy zfs_spa_note_txg_time Ns = Ns Sy 600 Pq uint This parameter defines, in seconds, how often the TXG time database will record a new TXG if it has changed. After the specified time interval has passed, and if the TXG number has changed, the new value is recorded in the database. These timestamps can later be used for more granular operations, such as scrubbing. . .It Sy zfs_spa_flush_txg_time Ns = Ns Sy 600 Pq uint This parameter defines, in seconds, how often the ZFS will flush the TXG time database to disk. It ensures that the data is actually written to persistent storage, which helps preserve the database in case of unexpected shutdown. The database is also automatically flushed during the export sequence. . .It Sy zfs_special_class_metadata_reserve_pct Ns = Ns Sy 25 Ns % Pq uint Only allow small data blocks to be allocated on the special and dedup vdev types when the available free space percentage on these vdevs exceeds this value. This ensures reserved space is available for pool metadata as the special vdevs approach capacity. . .It Sy zfs_sync_pass_dont_compress Ns = Ns Sy 8 Pq uint Starting in this sync pass, disable compression (including of metadata). With the default setting, in practice, we don't have this many sync passes, so this has no effect. .Pp The original intent was that disabling compression would help the sync passes to converge. However, in practice, disabling compression increases the average number of sync passes; because when we turn compression off, many blocks' size will change, and thus we have to re-allocate (not overwrite) them. It also increases the number of .Em 128 KiB allocations (e.g. for indirect blocks and spacemaps) because these will not be compressed. 
The .Em 128 KiB allocations are especially detrimental to performance on highly fragmented systems, which may have very few free segments of this size, and may need to load new metaslabs to satisfy these allocations. . .It Sy zfs_sync_pass_rewrite Ns = Ns Sy 2 Pq uint Rewrite new block pointers starting in this pass. . .It Sy zfs_trim_extent_bytes_max Ns = Ns Sy 134217728 Ns B Po 128 MiB Pc Pq uint Maximum size of a TRIM command. Larger ranges will be split into chunks no larger than this value before issuing. . .It Sy zfs_trim_extent_bytes_min Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped, unless they're part of a larger range which was chunked. This is done because it's common for these small TRIMs to negatively impact overall performance. . .It Sy zfs_trim_metaslab_skip Ns = Ns Sy 0 Ns | Ns 1 Pq uint Skip uninitialized metaslabs during the TRIM process. This option is useful for pools constructed from large thinly-provisioned devices where TRIM operations are slow. As a pool ages, an increasing fraction of the pool's metaslabs will be initialized, progressively degrading the usefulness of this option. This setting is stored when starting a manual TRIM and will persist for the duration of the requested TRIM. . .It Sy zfs_trim_queue_limit Ns = Ns Sy 10 Pq uint Maximum number of queued TRIMs outstanding per leaf vdev. The number of concurrent TRIM commands issued to the device is controlled by .Sy zfs_vdev_trim_min_active No and Sy zfs_vdev_trim_max_active . . .It Sy zfs_trim_txg_batch Ns = Ns Sy 32 Pq uint The number of transaction groups' worth of frees which should be aggregated before TRIM operations are issued to the device. This setting represents a trade-off between issuing larger, more efficient TRIM operations and the delay before the recently trimmed space is available for use by the device. .Pp Increasing this value will allow frees to be aggregated for a longer time. This will result in larger TRIM operations and potentially increased memory usage. Decreasing this value will have the opposite effect. The default of .Sy 32 was determined to be a reasonable compromise. . .It Sy zfs_txg_history Ns = Ns Sy 100 Pq uint Historical statistics for this many latest TXGs will be available in .Pa /proc/spl/kstat/zfs/ Ns Ao Ar pool Ac Ns Pa /TXGs . . .It Sy zfs_txg_timeout Ns = Ns Sy 5 Ns s Pq uint Flush dirty data to disk at least every this many seconds (maximum TXG duration). . .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint Max vdev I/O aggregation size for non-rotating media. . .It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation immediately follows its predecessor on rotational vdevs for the purpose of making decisions based on load. . .It Sy zfs_vdev_mirror_rotating_seek_inc Ns = Ns Sy 5 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation lacks locality as defined by .Sy zfs_vdev_mirror_rotating_seek_offset . Operations within this that are not immediately following the previous operation are incremented by half. .
.It Sy zfs_vdev_mirror_rotating_seek_offset Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq int The maximum distance for the last queued I/O operation in which the balancing algorithm considers an operation to have locality. .No See Sx ZFS I/O SCHEDULER . . .It Sy zfs_vdev_mirror_non_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member on non-rotational vdevs when I/O operations do not immediately follow one another. . .It Sy zfs_vdev_mirror_non_rotating_seek_inc Ns = Ns Sy 1 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation lacks locality as defined by the .Sy zfs_vdev_mirror_rotating_seek_offset . Operations within this that are not immediately following the previous operation are incremented by half. . .It Sy zfs_vdev_read_gap_limit Ns = Ns Sy 32768 Ns B Po 32 KiB Pc Pq uint Aggregate read I/O operations if the on-disk gap between them is within this threshold. . .It Sy zfs_vdev_write_gap_limit Ns = Ns Sy 4096 Ns B Po 4 KiB Pc Pq uint Aggregate write I/O operations if the on-disk gap between them is within this threshold. . .It Sy zfs_vdev_raidz_impl Ns = Ns Sy fastest Pq string Select the raidz parity implementation to use. .Pp Variants that don't depend on CPU-specific features may be selected on module load, as they are supported on all systems. The remaining options may only be set after the module is loaded, as they are available only if the implementations are compiled in and supported on the running system. .Pp Once the module is loaded, .Pa /sys/module/zfs/parameters/zfs_vdev_raidz_impl will show the available options, with the currently selected one enclosed in square brackets. .Pp .TS lb l l . fastest selected by built-in benchmark original original implementation scalar scalar implementation sse2 SSE2 instruction set 64-bit x86 ssse3 SSSE3 instruction set 64-bit x86 avx2 AVX2 instruction set 64-bit x86 avx512f AVX512F instruction set 64-bit x86 avx512bw AVX512F & AVX512BW instruction sets 64-bit x86 aarch64_neon NEON Aarch64/64-bit ARMv8 aarch64_neonx2 NEON with more unrolling Aarch64/64-bit ARMv8 powerpc_altivec Altivec PowerPC .TE . .It Sy zfs_zevent_len_max Ns = Ns Sy 512 Pq uint Max event queue length. Events in the queue can be viewed with .Xr zpool-events 8 . . .It Sy zfs_zevent_retain_max Ns = Ns Sy 2000 Pq int Maximum recent zevent records to retain for duplicate checking. Setting this to .Sy 0 disables duplicate detection. . .It Sy zfs_zevent_retain_expire_secs Ns = Ns Sy 900 Ns s Po 15 min Pc Pq int Lifespan for a recent ereport that was retained for duplicate checking. . .It Sy zfs_zil_clean_taskq_maxalloc Ns = Ns Sy 1048576 Pq int The maximum number of taskq entries that are allowed to be cached. When this limit is exceeded transaction records (itxs) will be cleaned synchronously. . .It Sy zfs_zil_clean_taskq_minalloc Ns = Ns Sy 1024 Pq int The number of taskq entries that are pre-populated when the taskq is first created and are immediately available for use. . .It Sy zfs_zil_clean_taskq_nthr_pct Ns = Ns Sy 100 Ns % Pq int This controls the number of threads used by .Sy dp_zil_clean_taskq . The default value of .Sy 100% will create a maximum of one thread per CPU. . .It Sy zil_maxblocksize Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint This sets the maximum block size used by the ZIL. 
On very fragmented pools, lowering this .Pq typically to Sy 36 KiB can improve performance. . .It Sy zil_maxcopied Ns = Ns Sy 7680 Ns B Po 7.5 KiB Pc Pq uint This sets the maximum number of write bytes logged via WR_COPIED. It tunes a tradeoff between additional memory copy and possibly worse log space efficiency vs additional range lock/unlock. . .It Sy zil_nocacheflush Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable the cache flush commands that are normally sent to disk by the ZIL after an LWB write has completed. Setting this will cause ZIL corruption on power loss if a volatile out-of-order write cache is enabled. . .It Sy zil_replay_disable Ns = Ns Sy 0 Ns | Ns 1 Pq int Disable intent logging replay. Can be disabled for recovery from corrupted ZIL. . .It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Limit SLOG write size per commit executed with synchronous priority. Any writes above that will be executed with lower (asynchronous) priority to limit potential SLOG device abuse by single active ZIL writer. . .It Sy zfs_zil_saxattr Ns = Ns Sy 1 Ns | Ns 0 Pq int Setting this tunable to zero disables ZIL logging of new .Sy xattr Ns = Ns Sy sa records if the .Sy org.openzfs:zilsaxattr feature is enabled on the pool. This would only be necessary to work around bugs in the ZIL logging or replay code for this record type. The tunable has no effect if the feature is disabled. . .It Sy zfs_embedded_slog_min_ms Ns = Ns Sy 64 Pq uint Usually, one metaslab from each normal and special class vdev is dedicated for use by the ZIL to log synchronous writes. However, if there are fewer than .Sy zfs_embedded_slog_min_ms metaslabs in the vdev, this functionality is disabled. This ensures that we don't set aside an unreasonable amount of space for the ZIL. . .It Sy zstd_earlyabort_pass Ns = Ns Sy 1 Pq uint Whether heuristic for detection of incompressible data with zstd levels >= 3 using LZ4 and zstd-1 passes is enabled. . .It Sy zstd_abort_size Ns = Ns Sy 131072 Pq uint Minimal uncompressed size (inclusive) of a record before the early abort heuristic will be attempted. . .It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int If non-zero, the zio deadman will produce debugging messages .Pq see Sy zfs_dbgmsg_enable for all zios, rather than only for leaf zios possessing a vdev. This is meant to be used by developers to gain diagnostic information for hang conditions which don't involve a mutex or other locking primitive: typically conditions in which a thread in the zio pipeline is looping indefinitely. . .It Sy zio_slow_io_ms Ns = Ns Sy 30000 Ns ms Po 30 s Pc Pq int When an I/O operation takes more than this much time to complete, it's marked as slow. Each slow operation causes a delay zevent. Slow I/O counters can be seen with .Nm zpool Cm status Fl s . . .It Sy zio_dva_throttle_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int Throttle block allocations in the I/O pipeline. This allows for dynamic allocation distribution based on device performance. . .It Sy zfs_xattr_compat Ns = Ns 0 Ns | Ns 1 Pq int Control the naming scheme used when setting new xattrs in the user namespace. If .Sy 0 .Pq the default on Linux , user namespace xattr names are prefixed with the namespace, to be backwards compatible with previous versions of ZFS on Linux. If .Sy 1 .Pq the default on Fx , user namespace xattr names are not prefixed, to be backwards compatible with previous versions of ZFS on illumos and .Fx . 
.Pp Either naming scheme can be read on this and future versions of ZFS, regardless of this tunable, but legacy ZFS on illumos or .Fx are unable to read user namespace xattrs written in the Linux format, and legacy versions of ZFS on Linux are unable to read user namespace xattrs written in the legacy ZFS format. .Pp An existing xattr with the alternate naming scheme is removed when overwriting the xattr so as to not accumulate duplicates. . .It Sy zio_requeue_io_start_cut_in_line Ns = Ns Sy 0 Ns | Ns 1 Pq int Prioritize requeued I/O. . .It Sy zio_taskq_batch_pct Ns = Ns Sy 80 Ns % Pq uint Percentage of online CPUs which will run a worker thread for I/O. These workers are responsible for I/O work such as compression, encryption, checksum and parity calculations. Fractional number of CPUs will be rounded down. .Pp The default value of .Sy 80% was chosen to avoid using all CPUs which can result in latency issues and inconsistent application performance, especially when slower compression and/or checksumming is enabled. Set value only applies to pools imported/created after that. . .It Sy zio_taskq_batch_tpq Ns = Ns Sy 0 Pq uint Number of worker threads per taskq. Higher values improve I/O ordering and CPU utilization, while lower reduce lock contention. Set value only applies to pools imported/created after that. .Pp If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. Set value only applies to pools imported/created after that. . .It Sy zio_taskq_write_tpq Ns = Ns Sy 16 Pq uint Determines the minimum number of threads per write issue taskq. Higher values improve CPU utilization on high throughput, while lower reduce taskq locks contention on high IOPS. Set value only applies to pools imported/created after that. . .It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp Set the queue and thread configuration for the IO read queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. Set values only apply to pools imported/created after that. . .It Sy zio_taskq_write Ns = Ns Sy sync null scale null Pq charp Set the queue and thread configuration for the IO write queues. This is an advanced debugging parameter. Don't change this unless you understand what it does. Set values only apply to pools imported/created after that. . .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on systems with a very large number of zvols. . .It Sy zvol_major Ns = Ns Sy 230 Pq uint Major number for zvol block devices. . .It Sy zvol_max_discard_blocks Ns = Ns Sy 16384 Pq long Discard (TRIM) operations done on zvols will be done in batches of this many blocks, where block size is determined by the .Sy volblocksize property of a zvol. . .It Sy zvol_prefetch_bytes Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint When adding a zvol to the system, prefetch this many bytes from the start and end of the volume. Prefetching these regions of the volume is desirable, because they are likely to be accessed immediately by .Xr blkid 8 or the kernel partitioner. . .It Sy zvol_request_sync Ns = Ns Sy 0 Ns | Ns 1 Pq uint When processing I/O requests for a zvol, submit them synchronously. This effectively limits the queue depth to .Em 1 for each I/O submitter. When unset, requests are handled asynchronously by a thread pool. The number of requests which can be handled concurrently is controlled by .Sy zvol_threads . 
.Sy zvol_request_sync is ignored when running on a kernel that supports block multiqueue .Pq Li blk-mq . . .It Sy zvol_num_taskqs Ns = Ns Sy 0 Pq uint Number of zvol taskqs. If .Sy 0 (the default) then scaling is done internally to prefer 6 threads per taskq. This only applies on Linux. . .It Sy zvol_threads Ns = Ns Sy 0 Pq uint The number of system wide threads to use for processing zvol block IOs. If .Sy 0 (the default) then internally set .Sy zvol_threads to the number of CPUs present or 32 (whichever is greater). . .It Sy zvol_blk_mq_threads Ns = Ns Sy 0 Pq uint The number of threads per zvol to use for queuing IO requests. This parameter will only appear if your kernel supports .Li blk-mq and is only read and assigned to a zvol at zvol load time. If .Sy 0 (the default) then internally set .Sy zvol_blk_mq_threads to the number of CPUs present. . .It Sy zvol_use_blk_mq Ns = Ns Sy 0 Ns | Ns 1 Pq uint Set to .Sy 1 to use the .Li blk-mq API for zvols. Set to .Sy 0 (the default) to use the legacy zvol APIs. This setting can give better or worse zvol performance depending on the workload. This parameter will only appear if your kernel supports .Li blk-mq and is only read and assigned to a zvol at zvol load time. . .It Sy zvol_blk_mq_blocks_per_thread Ns = Ns Sy 8 Pq uint If .Sy zvol_use_blk_mq is enabled, then process this number of .Sy volblocksize Ns -sized blocks per zvol thread. This tunable can be used to favor better performance for zvol reads (lower values) or writes (higher values). If set to .Sy 0 , then the zvol layer will process the maximum number of blocks per thread that it can. This parameter will only appear if your kernel supports .Li blk-mq and is only applied at each zvol's load time. . .It Sy zvol_blk_mq_queue_depth Ns = Ns Sy 0 Pq uint The queue_depth value for the zvol .Li blk-mq interface. This parameter will only appear if your kernel supports .Li blk-mq and is only applied at each zvol's load time. If .Sy 0 (the default) then use the kernel's default queue depth. Values are clamped to the kernel's .Dv BLKDEV_MIN_RQ and .Dv BLKDEV_MAX_RQ Ns / Ns Dv BLKDEV_DEFAULT_RQ limits. . .It Sy zvol_volmode Ns = Ns Sy 1 Pq uint Defines the behavior of zvol block devices when .Sy volmode Ns = Ns Sy default : .Bl -tag -compact -offset 4n -width "a" .It Sy 1 .No equivalent to Sy full .It Sy 2 .No equivalent to Sy dev .It Sy 3 .No equivalent to Sy none .El . .It Sy zvol_enforce_quotas Ns = Ns Sy 0 Ns | Ns 1 Pq uint Enable strict ZVOL quota enforcement. The strict quota enforcement may have a performance impact. .El . .Sh ZFS I/O SCHEDULER ZFS issues I/O operations to leaf vdevs to satisfy and complete I/O operations. The scheduler determines when and in what order those operations are issued. The scheduler divides operations into five I/O classes, prioritized in the following order: sync read, sync write, async read, async write, and scrub/resilver. Each queue defines the minimum and maximum number of concurrent operations that may be issued to the device. In addition, the device has an aggregate maximum, .Sy zfs_vdev_max_active . Note that the sum of the per-queue minima must not exceed the aggregate maximum. If the sum of the per-queue maxima exceeds the aggregate maximum, then the number of active operations may reach .Sy zfs_vdev_max_active , in which case no further operations will be issued, regardless of whether all per-queue minima have been met. .Pp For many physical devices, throughput increases with the number of concurrent operations, but latency typically suffers.
Furthermore, physical devices typically have a limit at which more concurrent operations have no effect on throughput or can actually cause it to decrease. .Pp The scheduler selects the next operation to issue by first looking for an I/O class whose minimum has not been satisfied. Once all are satisfied and the aggregate maximum has not been hit, the scheduler looks for classes whose maximum has not been satisfied. Iteration through the I/O classes is done in the order specified above. No further operations are issued if the aggregate maximum number of concurrent operations has been hit, or if there are no operations queued for an I/O class that has not hit its maximum. Every time an I/O operation is queued or an operation completes, the scheduler looks for new operations to issue. .Pp In general, smaller .Sy max_active Ns s will lead to lower latency of synchronous operations. Larger .Sy max_active Ns s may lead to higher overall throughput, depending on underlying storage. .Pp The ratio of the queues' .Sy max_active Ns s determines the balance of performance between reads, writes, and scrubs. For example, increasing .Sy zfs_vdev_scrub_max_active will cause the scrub or resilver to complete more quickly, but reads and writes to have higher latency and lower throughput. .Pp All I/O classes have a fixed maximum number of outstanding operations, except for the async write class. Asynchronous writes represent the data that is committed to stable storage during the syncing stage for transaction groups. Transaction groups enter the syncing state periodically, so the number of queued async writes will quickly burst up and then bleed down to zero. Rather than servicing them as quickly as possible, the I/O scheduler changes the maximum number of active async write operations according to the amount of dirty data in the pool. Since both throughput and latency typically increase with the number of concurrent operations issued to physical devices, reducing the burstiness in the number of simultaneous operations also stabilizes the response time of operations from other queues, in particular synchronous ones. In broad strokes, the I/O scheduler will issue more concurrent operations from the async write queue as there is more dirty data in the pool. . .Ss Async Writes The number of concurrent operations issued for the async write I/O class follows a piece-wise linear function defined by a few adjustable points: .Bd -literal | o---------| <-- \fBzfs_vdev_async_write_max_active\fP ^ | /^ | | | / | | active | / | | I/O | / | | count | / | | | / | | |-------o | | <-- \fBzfs_vdev_async_write_min_active\fP 0|_______^______|_________| 0% | | 100% of \fBzfs_dirty_data_max\fP | | | `-- \fBzfs_vdev_async_write_active_max_dirty_percent\fP `--------- \fBzfs_vdev_async_write_active_min_dirty_percent\fP .Ed .Pp Until the amount of dirty data exceeds a minimum percentage of the dirty data allowed in the pool, the I/O scheduler will limit the number of concurrent operations to the minimum. As that threshold is crossed, the number of concurrent operations issued increases linearly to the maximum at the specified maximum percentage of the dirty data allowed in the pool. .Pp Ideally, the amount of dirty data on a busy pool will stay in the sloped part of the function between .Sy zfs_vdev_async_write_active_min_dirty_percent and .Sy zfs_vdev_async_write_active_max_dirty_percent . 
If it exceeds the maximum percentage, this indicates that the rate of incoming data is greater than the rate that the backend storage can handle. In this case, we must further throttle incoming writes, as described in the next section. . .Sh ZFS TRANSACTION DELAY We delay transactions when we've determined that the backend storage isn't able to accommodate the rate of incoming writes. .Pp If there is already a transaction waiting, we delay relative to when that transaction will finish waiting. This way the calculated delay time is independent of the number of threads concurrently executing transactions. .Pp If we are the only waiter, wait relative to when the transaction started, rather than the current time. This credits the transaction for "time already served", e.g. reading indirect blocks. .Pp The minimum time for a transaction to take is calculated as .D1 min_time = min( Ns Sy zfs_delay_scale No \(mu Po Sy dirty No \- Sy min Pc / Po Sy max No \- Sy dirty Pc , 100ms) .Pp The delay has two degrees of freedom that can be adjusted via tunables. The percentage of dirty data at which we start to delay is defined by .Sy zfs_delay_min_dirty_percent . This should typically be at or above .Sy zfs_vdev_async_write_active_max_dirty_percent , so that we only start to delay after writing at full speed has failed to keep up with the incoming write rate. The scale of the curve is defined by .Sy zfs_delay_scale . Roughly speaking, this variable determines the amount of delay at the midpoint of the curve. .Bd -literal delay 10ms +-------------------------------------------------------------*+ | *| 9ms + *+ | *| 8ms + *+ | * | 7ms + * + | * | 6ms + * + | * | 5ms + * + | * | 4ms + * + | * | 3ms + * + | * | 2ms + (midpoint) * + | | ** | 1ms + v *** + | \fBzfs_delay_scale\fP ----------> ******** | 0 +-------------------------------------*********----------------+ 0% <- \fBzfs_dirty_data_max\fP -> 100% .Ed .Pp Note, that since the delay is added to the outstanding time remaining on the most recent transaction it's effectively the inverse of IOPS. Here, the midpoint of .Em 500 us translates to .Em 2000 IOPS . The shape of the curve was chosen such that small changes in the amount of accumulated dirty data in the first three quarters of the curve yield relatively small differences in the amount of delay. .Pp The effects can be easier to understand when the amount of delay is represented on a logarithmic scale: .Bd -literal delay 100ms +-------------------------------------------------------------++ + + | | + *+ 10ms + *+ + ** + | (midpoint) ** | + | ** + 1ms + v **** + + \fBzfs_delay_scale\fP ----------> ***** + | **** | + **** + 100us + ** + + * + | * | + * + 10us + * + + + | | + + +--------------------------------------------------------------+ 0% <- \fBzfs_dirty_data_max\fP -> 100% .Ed .Pp Note here that only as the amount of dirty data approaches its limit does the delay start to increase rapidly. The goal of a properly tuned system should be to keep the amount of dirty data out of that range by first ensuring that the appropriate limits are set for the I/O scheduler to reach optimal throughput on the back-end storage, and then by changing the value of .Sy zfs_delay_scale to increase the steepness of the curve. 
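.Pp
As a rough illustration of the two dirty-data-driven mechanisms described above, the following sketch (Python, not the in-kernel implementation) models the piece-wise linear async write scaling and the transaction delay curve. The tunable names come from this page; the numeric defaults plugged in are assumptions chosen only for the example, with the 500 us midpoint taken from the discussion above.
.Bd -literal -offset Ds
# Illustrative sketch of the dirty-data-driven write throttling described
# above; not the in-kernel implementation.  Tunable names are from this
# manual page; the numeric defaults below are assumptions for the example.

ZFS_DIRTY_DATA_MAX = 4 << 30      # zfs_dirty_data_max, bytes (assumed 4 GiB)

# Async write scheduling (see the "Async Writes" diagram above)
ASYNC_WRITE_MIN_ACTIVE = 2        # zfs_vdev_async_write_min_active (assumed)
ASYNC_WRITE_MAX_ACTIVE = 10       # zfs_vdev_async_write_max_active (assumed)
ACTIVE_MIN_DIRTY_PCT = 30         # zfs_vdev_async_write_active_min_dirty_percent (assumed)
ACTIVE_MAX_DIRTY_PCT = 60         # zfs_vdev_async_write_active_max_dirty_percent (assumed)

# Transaction delay (see "ZFS TRANSACTION DELAY" above)
ZFS_DELAY_SCALE_NS = 500_000      # zfs_delay_scale; the 500 us midpoint noted above
DELAY_MIN_DIRTY_PCT = 60          # zfs_delay_min_dirty_percent (assumed)
MAX_DELAY_NS = 100_000_000        # the 100 ms cap in the min_time formula

def async_write_max_active(dirty):
    """Concurrent async writes allowed for `dirty` bytes of dirty data."""
    pct = 100 * dirty / ZFS_DIRTY_DATA_MAX
    if pct <= ACTIVE_MIN_DIRTY_PCT:
        return ASYNC_WRITE_MIN_ACTIVE
    if pct >= ACTIVE_MAX_DIRTY_PCT:
        return ASYNC_WRITE_MAX_ACTIVE
    frac = (pct - ACTIVE_MIN_DIRTY_PCT) / (ACTIVE_MAX_DIRTY_PCT - ACTIVE_MIN_DIRTY_PCT)
    return round(ASYNC_WRITE_MIN_ACTIVE
                 + frac * (ASYNC_WRITE_MAX_ACTIVE - ASYNC_WRITE_MIN_ACTIVE))

def tx_delay_ns(dirty):
    """min_time = min(zfs_delay_scale * (dirty - min) / (max - dirty), 100 ms)."""
    lo = ZFS_DIRTY_DATA_MAX * DELAY_MIN_DIRTY_PCT // 100
    if dirty <= lo:
        return 0
    if dirty >= ZFS_DIRTY_DATA_MAX:
        return MAX_DELAY_NS
    delay = ZFS_DELAY_SCALE_NS * (dirty - lo) // (ZFS_DIRTY_DATA_MAX - dirty)
    return min(delay, MAX_DELAY_NS)

if __name__ == "__main__":
    for pct in (25, 45, 60, 80, 95, 99):
        dirty = ZFS_DIRTY_DATA_MAX * pct // 100
        print(f"{pct:3d}% dirty: {async_write_max_active(dirty):2d} async writers, "
              f"{tx_delay_ns(dirty) / 1000:9.1f} us delay")
.Ed
.Pp
With these assumed values the delay is zero until the 60% threshold, equals .Sy zfs_delay_scale at the midpoint (80% of .Sy zfs_dirty_data_max ), and climbs steeply toward the 100 ms cap as the limit is approached, matching the curves shown above.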
diff --git a/sys/contrib/openzfs/man/man5/vdev_id.conf.5 b/sys/contrib/openzfs/man/man5/vdev_id.conf.5 index d2f817631c15..299a23720201 100644 --- a/sys/contrib/openzfs/man/man5/vdev_id.conf.5 +++ b/sys/contrib/openzfs/man/man5/vdev_id.conf.5 @@ -1,258 +1,258 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" This file and its contents are supplied under the terms of the .\" Common Development and Distribution License ("CDDL"), version 1.0. .\" You may only use this file in accordance with the terms of version .\" 1.0 of the CDDL. .\" .\" A full copy of the text of the CDDL should have accompanied this .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" -.Dd May 26, 2021 +.Dd October 8, 2024 .Dt VDEV_ID.CONF 5 .Os . .Sh NAME .Nm vdev_id.conf .Nd configuration file for vdev_id(8) .Sh DESCRIPTION .Nm is the configuration file for .Xr vdev_id 8 . It controls the default behavior of .Xr vdev_id 8 while it is mapping a disk device name to an alias. .Pp The .Nm file uses a simple format consisting of a keyword followed by one or more values on a single line. Any line not beginning with a recognized keyword is ignored. Comments may optionally begin with a hash character. .Pp The following keywords and values are used. .Bl -tag -width "-h" .It Sy alias Ar name Ar devlink Maps a device link in the .Pa /dev directory hierarchy to a new device name. The udev rule defining the device link must have run prior to .Xr vdev_id 8 . A defined alias takes precedence over a topology-derived name, but the two naming methods can otherwise coexist. For example, one might name drives in a JBOD with the .Sy sas_direct topology while naming an internal L2ARC device with an alias. .Pp .Ar name is the name of the link to the device that will be created under .Pa /dev/disk/by-vdev . .Pp .Ar devlink is the name of the device link that has already been defined by udev. This may be an absolute path or the base filename. . .It Sy channel [ Ns Ar pci_slot ] Ar port Ar name Maps a physical path to a channel name (typically representing a single disk enclosure). . .It Sy enclosure_symlinks Sy yes Ns | Ns Sy no Additionally create .Pa /dev/by-enclosure symlinks to the disk enclosure .Em sg devices using the naming scheme from .Pa vdev_id.conf . .Sy enclosure_symlinks is only allowed for .Sy sas_direct mode. . .It Sy enclosure_symlinks_prefix Ar prefix Specify the prefix for the enclosure symlinks in the form .Pa /dev/by-enclosure/ Ns Ao Ar prefix Ac Ns - Ns Ao Ar channel Ac Ns Aq Ar num .Pp Defaults to .Dq Em enc . . .It Sy slot Ar prefix Ar new Op Ar channel Maps a disk slot number as reported by the operating system to an alternative slot number. If the .Ar channel parameter is specified then the mapping is only applied to slots in the named channel; otherwise the mapping is applied to all channels. The first-specified .Ar slot rule that can match a slot takes precedence. Therefore a channel-specific mapping for a given slot should generally appear before a generic mapping for the same slot. In this way a custom mapping may be applied to a particular channel and a default mapping applied to the others. . .It Sy zpad_slot Ar digits Pad slot numbers with zeros to make them .Ar digits long, which can help to make disk names a consistent length and easier to sort. . .It Sy multipath Sy yes Ns | Ns Sy no Specifies whether .Xr vdev_id 8 will handle only dm-multipath devices.
If set to .Sy yes then .Xr vdev_id 8 will examine the first running component disk of a dm-multipath device as provided by the driver command to determine the physical path. . .It Sy topology Sy sas_direct Ns | Ns Sy sas_switch Ns | Ns Sy scsi Identifies a physical topology that governs how physical paths are mapped to channels: .Bl -tag -compact -width "sas_direct and scsi" .It Sy sas_direct No and Sy scsi channels are uniquely identified by a PCI slot and HBA port number .It Sy sas_switch channels are uniquely identified by a SAS switch port number .El . .It Sy phys_per_port Ar num Specifies the number of PHY devices associated with a SAS HBA port or SAS switch port. .Xr vdev_id 8 internally uses this value to determine which HBA or switch port a device is connected to. The default is .Sy 4 . . .It Sy slot Sy bay Ns | Ns Sy phy Ns | Ns Sy port Ns | Ns Sy id Ns | Ns Sy lun Ns | Ns Sy bay_lun Ns | Ns Sy ses Specifies from which element of a SAS identifier the slot number is taken. The default is .Sy bay : .Bl -tag -compact -width "port" .It Sy bay read the slot number from the bay identifier. .It Sy phy read the slot number from the phy identifier. .It Sy port use the SAS port as the slot number. .It Sy id use the scsi id as the slot number. .It Sy lun use the scsi lun as the slot number. .It Sy bay_lun read the slot number from the bay identifier and append the lun number. Useful for multi-lun multi-actuator hard drives. .It Sy ses use the SCSI Enclosure Services (SES) enclosure device slot number, as reported by .Xr sg_ses 8 . Intended for use only on systems where .Sy bay is unsupported, noting that .Sy port and .Sy id may be unstable across disk replacement. .El .El . .Sh FILES .Bl -tag -width "-v v" .It Pa /etc/zfs/vdev_id.conf The configuration file for .Xr vdev_id 8 . .El . .Sh EXAMPLES A non-multipath configuration with direct-attached SAS enclosures and an arbitrary slot re-mapping: .Bd -literal -compact -offset Ds multipath no topology sas_direct phys_per_port 4 slot bay # PCI_SLOT HBA PORT CHANNEL NAME channel 85:00.0 1 A channel 85:00.0 0 B channel 86:00.0 1 C channel 86:00.0 0 D # Custom mapping for Channel A # Linux Mapped # Slot Slot Channel slot 1 7 A slot 2 10 A slot 3 3 A slot 4 6 A # Default mapping for B, C, and D slot 1 4 slot 2 2 slot 3 1 slot 4 3 .Ed .Pp A SAS-switch topology. Note, that the .Ar channel keyword takes only two arguments in this example: .Bd -literal -compact -offset Ds topology sas_switch # SWITCH PORT CHANNEL NAME channel 1 A channel 2 B channel 3 C channel 4 D .Ed .Pp A multipath configuration. Note that channel names have multiple definitions - one per physical path: .Bd -literal -compact -offset Ds multipath yes # PCI_SLOT HBA PORT CHANNEL NAME channel 85:00.0 1 A channel 85:00.0 0 B channel 86:00.0 1 A channel 86:00.0 0 B .Ed .Pp A configuration with enclosure_symlinks enabled: .Bd -literal -compact -offset Ds multipath yes enclosure_symlinks yes # PCI_ID HBA PORT CHANNEL NAME channel 05:00.0 1 U channel 05:00.0 0 L channel 06:00.0 1 U channel 06:00.0 0 L .Ed In addition to the disks symlinks, this configuration will create: .Bd -literal -compact -offset Ds /dev/by-enclosure/enc-L0 /dev/by-enclosure/enc-L1 /dev/by-enclosure/enc-U0 /dev/by-enclosure/enc-U1 .Ed .Pp A configuration using device link aliases: .Bd -literal -compact -offset Ds # by-vdev # name fully qualified or base name of device link alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca alias d2 wwn-0x5000c5002def789e .Ed . 
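.Pp
For illustration only, the sketch below (Python; the real .Xr vdev_id 8 helper is a shell script) shows one way the keyword/value format described in .Sx DESCRIPTION could be read. The keyword names and the default file path are taken from this page; everything else is hypothetical.
.Bd -literal -offset Ds
# Minimal, hypothetical reader for the vdev_id.conf keyword/value format;
# not the real vdev_id implementation.
KEYWORDS = {
    "alias", "channel", "enclosure_symlinks", "enclosure_symlinks_prefix",
    "slot", "zpad_slot", "multipath", "topology", "phys_per_port",
}

def parse_vdev_id_conf(path="/etc/zfs/vdev_id.conf"):
    """Return (keyword, values) pairs, skipping comments and unrecognized lines."""
    entries = []
    with open(path) as f:
        for raw in f:
            line = raw.split("#", 1)[0].strip()    # comments begin with a hash
            if not line:
                continue
            keyword, *values = line.split()
            if keyword in KEYWORDS:                # other lines are ignored
                entries.append((keyword, values))
    return entries

# Applied to the first example above, this yields entries such as
# ("multipath", ["no"]), ("topology", ["sas_direct"]),
# ("channel", ["85:00.0", "1", "A"]) and ("slot", ["1", "7", "A"]).
.Ed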
.Sh SEE ALSO .Xr vdev_id 8 diff --git a/sys/contrib/openzfs/man/man7/dracut.zfs.7 b/sys/contrib/openzfs/man/man7/dracut.zfs.7 index fb5da553af6e..3d051d4d3343 100644 --- a/sys/contrib/openzfs/man/man7/dracut.zfs.7 +++ b/sys/contrib/openzfs/man/man7/dracut.zfs.7 @@ -1,283 +1,283 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" SPDX-License-Identifier: 0BSD .\" -.Dd March 28, 2023 +.Dd July 13, 2024 .Dt DRACUT.ZFS 7 .Os . .Sh NAME .Nm dracut.zfs .Nd overview of ZFS dracut hooks . .Sh SYNOPSIS .Bd -literal -compact parse-zfs.sh \(-> dracut-cmdline.service | \(da | … | \(da \e\(em\(em\(em\(em\(em\(em\(em\(em\(-> dracut-initqueue.service | zfs-import-opts.sh zfs-load-module.service \(da | | | | sysinit.target \(da | \(da | | zfs-import-scan.service \(da zfs-import-scan.service \(da \(da | zfs-import-cache.service | zfs-import-cache.service basic.target | | \e__________________| | \(da \(da \(da | zfs-load-key.sh zfs-env-bootfs.service | | \(da \(da \(da zfs-import.target \(-> dracut-pre-mount.service | \(ua | | dracut-zfs-generator | | _____________________/| |/ \(da | sysroot.mount \(<-\(em\(em\(em dracut-zfs-generator | | | \(da | initrd-root-fs.target \(<-\(em zfs-nonroot-necessities.service | | | | \(da | \(da dracut-mount.service | zfs-snapshot-bootfs.service | | | \(da | \(da … | zfs-rollback-bootfs.service | | | \(da | | /sysroot/{usr,etc,lib,&c.} \(<-\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em/ | | | \(da | initrd-fs.target \e______________________ | \e| \(da export-zfs.sh initrd.target | | \(da \(da dracut-shutdown.service … | \(da zfs-needshutdown.sh \(-> initrd-cleanup.service .Ed .Pp Compare .Xr dracut.bootup 7 for the full flowchart. . .Sh DESCRIPTION Under dracut, booting with .No ZFS-on- Ns Pa / is facilitated by a number of hooks in the .Nm 90zfs module. .Pp Booting into a ZFS dataset requires .Sy mountpoint Ns = Ns Pa / to be set on the dataset containing the root filesystem (henceforth "the boot dataset") and at the very least either the .Sy bootfs property to be set to that dataset, or the .Sy root= kernel cmdline (or dracut drop-in) argument to specify it. .Pp All children of the boot dataset with .Sy canmount Ns = Ns Sy on with .Sy mountpoint Ns s matching .Pa /etc , /bin , /lib , /lib?? , /libx32 , No and Pa /usr globs are deemed essential and will be mounted as well. .Pp .Xr zfs-mount-generator 8 is recommended for proper functioning of the system afterward (correct mount properties, remounting, &c.). . .Sh CMDLINE .Ss Standard .Bl -tag -compact -width ".Sy root=zfs:AUTO , root=zfs: , root=zfs , Op Sy root=" .It Sy root=zfs:\& Ns Ar dataset , Sy root=ZFS= Ns Ar dataset Use .Ar dataset as the boot dataset. All pluses .Pq Sq + are replaced with spaces .Pq Sq \ . . .It Sy root=zfs:AUTO , root=zfs:\& , root=zfs , Op Sy root= After import, search for the first pool with the .Sy bootfs property set, use its value as-if specified as the .Ar dataset above. . .It Sy rootfstype=zfs root= Ns Ar dataset Equivalent to .Sy root=zfs:\& Ns Ar dataset . . .It Sy rootfstype=zfs Op Sy root= Equivalent to .Sy root=zfs:AUTO . . .It Sy rootflags= Ns Ar flags Mount the boot dataset with .Fl o Ar flags ; cf.\& .Sx Temporary Mount Point Properties in .Xr zfsprops 7 . These properties will not last, since all filesystems will be re-mounted from the real root. . .It Sy debug If specified, .Nm dracut-zfs-generator logs to the journal. 
.El .Pp Be careful about setting neither .Sy rootfstype=zfs nor .Sy root=zfs:\& Ns Ar dataset \(em other automatic boot selection methods, like .Nm systemd-gpt-auto-generator and .Nm systemd-fstab-generator might take precedence. . .Ss ZFS-specific .Bl -tag -compact -width ".Sy bootfs.snapshot Ns Op Sy = Ns Ar snapshot-name" .It Sy bootfs.snapshot Ns Op Sy = Ns Ar snapshot-name Execute .Nm zfs Cm snapshot Ar boot-dataset Ns Sy @ Ns Ar snapshot-name before pivoting to the real root. .Ar snapshot-name defaults to the current kernel release. . .It Sy bootfs.rollback Ns Op Sy = Ns Ar snapshot-name Execute .Nm zfs Cm rollback Fl Rf Ar boot-dataset Ns Sy @ Ns Ar snapshot-name before pivoting to the real root. .Ar snapshot-name defaults to the current kernel release. . .It Sy spl_hostid= Ns Ar host-id Use .Xr zgenhostid 8 to set the host ID to .Ar host-id ; otherwise, .Pa /etc/hostid inherited from the real root is used. . .It Sy zfs_force , zfs.force , zfsforce Appends .Fl f to all .Nm zpool Cm import invocations; primarily useful in conjunction with .Sy spl_hostid= , or if no host ID was inherited. .El . .Sh FILES .Bl -tag -width 0 .It Pa parse-zfs.sh Pq Sy cmdline Processes .Sy spl_hostid= . If .Sy root= matches a known pattern, above, provides .Pa /dev/root and delays the initqueue until .Xr zfs 4 is loaded. . .It Pa zfs-import-opts.sh Pq Nm systemd No environment generator Turns .Sy zfs_force , zfs.force , No or Sy zfsforce into .Ev ZPOOL_IMPORT_OPTS Ns = Ns Fl f for .Pa zfs-import-scan.service or .Pa zfs-import-cache.service . . .It Pa zfs-load-key.sh Pq Sy pre-mount Loads encryption keys for the boot dataset and its essential descendants. .Bl -tag -compact -offset 4n -width ".Sy keylocation Ns = Ns Sy https:// Ns Ar URL , Sy keylocation Ns = Ns Sy http:// Ns Ar URL" .It Sy keylocation Ns = Ns Sy prompt Is prompted for via .Nm systemd-ask-password thrice. . .It Sy keylocation Ns = Ns Sy https:// Ns Ar URL , Sy keylocation Ns = Ns Sy http:// Ns Ar URL .Pa network-online.target is started before loading. . .It Sy keylocation Ns = Ns Sy file:// Ns Ar path If .Ar path doesn't exist, .Nm udevadm No is Cm settle Ns d . If it still doesn't, it's waited for for up to .Sy 10 Ns s . .El . .It Pa zfs-env-bootfs.service Pq Nm systemd No service After pool import, sets .Ev BOOTFS Ns = in the systemd environment to the first non-null .Sy bootfs value in iteration order. . .It Pa dracut-zfs-generator Pq Nm systemd No generator Generates .Pa sysroot.mount Pq using Sy rootflags= , No if any . If an explicit boot dataset was specified, also generates essential mountpoints .Pq Pa sysroot-etc.mount , sysroot-bin.mount , No &c.\& , otherwise generates .Pa zfs-nonroot-necessities.service which mounts them explicitly after .Pa /sysroot using .Ev BOOTFS Ns = . . .It Pa zfs-snapshot-bootfs.service , zfs-rollback-bootfs.service Pq Nm systemd No services Consume .Sy bootfs.snapshot and .Sy bootfs.rollback as described in .Sx CMDLINE . Use .Ev BOOTFS Ns = if no explicit boot dataset was specified. . .It Pa zfs-needshutdown.sh Pq Sy cleanup If any pools were imported, signals that shutdown hooks are required. . .It Pa export-zfs.sh Pq Sy shutdown Forcibly exports all pools. . .It Pa /etc/hostid , /etc/zfs/zpool.cache , /etc/zfs/vdev_id.conf Pq regular files Included verbatim, hostonly. . .It Pa mount-zfs.sh Pq Sy mount Does nothing on .Nm systemd systems .Pq if Pa dracut-zfs-generator No succeeded . Otherwise, loads encryption key for the boot dataset from the console or via plymouth. It may not work at all! .El .
.Sh SEE ALSO .Xr dracut.bootup 7 , .Xr zfsprops 7 , .Xr zpoolprops 7 , .Xr dracut-shutdown.service 8 , .Xr systemd-fstab-generator 8 , .Xr systemd-gpt-auto-generator 8 , .Xr zfs-mount-generator 8 , .Xr zgenhostid 8 diff --git a/sys/contrib/openzfs/man/man7/vdevprops.7 b/sys/contrib/openzfs/man/man7/vdevprops.7 index acabe6b6613a..61e60d950416 100644 --- a/sys/contrib/openzfs/man/man7/vdevprops.7 +++ b/sys/contrib/openzfs/man/man7/vdevprops.7 @@ -1,201 +1,201 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2021 Klara, Inc. .\" -.Dd October 30, 2022 +.Dd July 23, 2024 .Dt VDEVPROPS 7 .Os . .Sh NAME .Nm vdevprops .Nd native and user-defined properties of ZFS vdevs . .Sh DESCRIPTION Properties are divided into two types, native properties and user-defined .Pq or Qq user properties. Native properties either export internal statistics or control ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on ZFS behavior, but you can use them to annotate vdevs in a way that is meaningful in your environment. For more information about user properties, see the .Sx User Properties section, below. . .Ss Native Properties Every vdev has a set of properties that export statistics about the vdev as well as control various behaviors. Properties are not inherited from top-level vdevs, with the exception of checksum_n, checksum_t, io_n, io_t, slow_io_n, and slow_io_t. .Pp The values of numeric properties can be specified using human-readable suffixes .Po for example, .Sy k , KB , M , Gb , and so forth, up to .Sy Z for zettabyte .Pc . The following are all valid .Pq and equal specifications: .Li 1536M , 1.5g , 1.50GB . .Pp The values of non-numeric properties are case sensitive and must be lowercase. .Pp The following native properties consist of read-only statistics about the vdev. These properties can not be changed. 
.Bl -tag -width "fragmentation" .It Sy capacity Percentage of vdev space used .It Sy state state of this vdev such as online, faulted, or offline .It Sy guid globally unique id of this vdev .It Sy asize The allocatable size of this vdev .It Sy psize The physical size of this vdev .It Sy ashift The physical sector size of this vdev expressed as the power of two .It Sy size The total size of this vdev .It Sy free The amount of remaining free space on this vdev .It Sy allocated The amount of allocated space on this vdev .It Sy expandsize How much this vdev can expand by .It Sy fragmentation Percent of fragmentation in this vdev .It Sy parity The level of parity for this vdev .It Sy devid The device id for this vdev .It Sy physpath The physical path to the device .It Sy encpath The enclosure path to the device .It Sy fru Field Replaceable Unit, usually a model number .It Sy parent Parent of this vdev .It Sy children Comma separated list of children of this vdev .It Sy numchildren The number of children belonging to this vdev .It Sy read_errors , write_errors , checksum_errors , initialize_errors , trim_errors The number of errors of each type encountered by this vdev .It Sy slow_ios The number of slow I/Os encountered by this vdev. These represent I/O operations that didn't complete in .Sy zio_slow_io_ms milliseconds .Pq Sy 30000 No by default . .It Sy null_ops , read_ops , write_ops , free_ops , claim_ops , trim_ops The number of I/O operations of each type performed by this vdev .It Xo .Sy null_bytes , read_bytes , write_bytes , free_bytes , claim_bytes , .Sy trim_bytes .Xc The cumulative size of all operations of each type performed by this vdev .It Sy removing If this device is currently being removed from the pool .It Sy trim_support Indicates if a leaf device supports trim operations. .El .Pp The following native properties can be used to change the behavior of a vdev. .Bl -tag -width "allocating" .It Sy checksum_n , checksum_t , io_n , io_t , slow_io_n , slow_io_t Tune the fault management daemon by specifying checksum/io thresholds of .Ar N errors in .Ar T seconds, respectively. These properties can be set on leaf and top-level vdevs. When the property is set on the leaf and top-level vdev, the value of the leaf vdev will be used. If the property is only set on the top-level vdev, this value will be used. The values of these properties do not persist across vdev replacement. For this reason, it is advisable to set the property on the top-level vdev, not on the leaf vdev itself. The default values for .Sy OpenZFS on Linux are 10 errors in 600 seconds. For .Sy OpenZFS on FreeBSD defaults see .Xr zfsd 8 . .It Sy comment A text comment up to 8192 characters long .It Sy bootsize The amount of space to reserve for the EFI system partition .It Sy failfast If this device should propagate BIO errors back to ZFS, used to disable failfast. .It Sy path The path to the device for this vdev .It Sy allocating If this device should perform new allocations, used to disable a device when it is scheduled for later removal. See .Xr zpool-remove 8 . .El .Ss User Properties In addition to the standard native properties, ZFS supports arbitrary user properties. User properties have no effect on ZFS behavior, but applications or administrators can use them to annotate vdevs. .Pp User property names must contain a colon .Pq Qq Sy \&: character to distinguish them from native properties.
They may contain lowercase letters, numbers, and the following punctuation characters: colon .Pq Qq Sy \&: , dash .Pq Qq Sy - , period .Pq Qq Sy \&. , and underscore .Pq Qq Sy _ . The expected convention is that the property name is divided into two portions such as .Ar module : Ns Ar property , but this namespace is not enforced by ZFS. User property names can be at most 256 characters, and cannot begin with a dash .Pq Qq Sy - . .Pp When making programmatic use of user properties, it is strongly suggested to use a reversed DNS domain name for the .Ar module component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. .Pp The values of user properties are arbitrary strings and are never validated. Use the .Nm zpool Cm set command with a blank value to clear a user property. Property values are limited to 8192 bytes. .Sh SEE ALSO .Xr zpoolprops 7 , .Xr zpool-set 8 diff --git a/sys/contrib/openzfs/man/man7/zfsconcepts.7 b/sys/contrib/openzfs/man/man7/zfsconcepts.7 index 5c736e53670d..bb2178d85bcd 100644 --- a/sys/contrib/openzfs/man/man7/zfsconcepts.7 +++ b/sys/contrib/openzfs/man/man7/zfsconcepts.7 @@ -1,246 +1,246 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" Copyright 2023 Klara, Inc. .\" -.Dd October 6, 2023 +.Dd October 2, 2024 .Dt ZFSCONCEPTS 7 .Os . .Sh NAME .Nm zfsconcepts .Nd overview of ZFS concepts . .Sh DESCRIPTION .Ss ZFS File System Hierarchy A ZFS storage pool is a logical collection of devices that provide space for datasets. A storage pool is also the root of the ZFS file system hierarchy. .Pp The root of the pool can be accessed as a file system, such as mounting and unmounting, taking snapshots, and setting properties. The physical storage characteristics, however, are managed by the .Xr zpool 8 command. .Pp See .Xr zpool 8 for more information on creating and administering pools. .Ss Snapshots A snapshot is a read-only copy of a file system or volume. Snapshots can be created extremely quickly, and initially consume no additional space within the pool. 
As data within the active dataset changes, the snapshot consumes more data than would otherwise be shared with the active dataset. .Pp Snapshots can have arbitrary names. Snapshots of volumes can be cloned or rolled back, visibility is determined by the .Sy snapdev property of the parent volume. .Pp File system snapshots can be accessed under the .Pa .zfs/snapshot directory in the root of the file system. Snapshots are automatically mounted on demand and may be unmounted at regular intervals. The availability and visibility of the .Pa .zfs directory can be controlled by the .Sy snapdir property. .Ss Bookmarks A bookmark is like a snapshot, a read-only copy of a file system or volume. Bookmarks can be created extremely quickly, compared to snapshots, and they consume no additional space within the pool. Bookmarks can also have arbitrary names, much like snapshots. .Pp Unlike snapshots, bookmarks can not be accessed through the filesystem in any way. From a storage standpoint a bookmark just provides a way to reference when a snapshot was created as a distinct object. Bookmarks are initially tied to a snapshot, not the filesystem or volume, and they will survive if the snapshot itself is destroyed. Since they are very light weight there's little incentive to destroy them. .Ss Clones A clone is a writable volume or file system whose initial contents are the same as another dataset. As with snapshots, creating a clone is nearly instantaneous, and initially consumes no additional space. .Pp Clones can only be created from a snapshot. When a snapshot is cloned, it creates an implicit dependency between the parent and child. Even though the clone is created somewhere else in the dataset hierarchy, the original snapshot cannot be destroyed as long as a clone exists. The .Sy origin property exposes this dependency, and the .Cm destroy command lists any such dependencies, if they exist. .Pp The clone parent-child dependency relationship can be reversed by using the .Cm promote subcommand. This causes the .Qq origin file system to become a clone of the specified file system, which makes it possible to destroy the file system that the clone was created from. .Ss "Mount Points" Creating a ZFS file system is a simple operation, so the number of file systems per system is likely to be numerous. To cope with this, ZFS automatically manages mounting and unmounting file systems without the need to edit the .Pa /etc/fstab file. All automatically managed file systems are mounted by ZFS at boot time. .Pp By default, file systems are mounted under .Pa /path , where .Ar path is the name of the file system in the ZFS namespace. Directories are created and destroyed as needed. .Pp A file system can also have a mount point set in the .Sy mountpoint property. This directory is created as needed, and ZFS automatically mounts the file system when the .Nm zfs Cm mount Fl a command is invoked .Po without editing .Pa /etc/fstab .Pc . The .Sy mountpoint property can be inherited, so if .Em pool/home has a mount point of .Pa /export/stuff , then .Em pool/home/user automatically inherits a mount point of .Pa /export/stuff/user . .Pp A file system .Sy mountpoint property of .Sy none prevents the file system from being mounted. .Pp If needed, ZFS file systems can also be managed with traditional tools .Po .Nm mount , .Nm umount , .Pa /etc/fstab .Pc . 
If a file system's mount point is set to .Sy legacy , ZFS makes no attempt to manage the file system, and the administrator is responsible for mounting and unmounting the file system. Because pools must be imported before a legacy mount can succeed, administrators should ensure that legacy mounts are only attempted after the zpool import process finishes at boot time. For example, on machines using systemd, the mount option .Pp .Nm x-systemd.requires=zfs-import.target .Pp will ensure that the zfs-import completes before systemd attempts mounting the filesystem. See .Xr systemd.mount 5 for details. .Ss Deduplication Deduplication is the process for removing redundant data at the block level, reducing the total amount of data stored. If a file system has the .Sy dedup property enabled, duplicate data blocks are removed synchronously. The result is that only unique data is stored and common components are shared among files. .Pp Deduplicating data is a very resource-intensive operation. It is generally recommended that you have at least 1.25 GiB of RAM per 1 TiB of storage when you enable deduplication. Calculating the exact requirement depends heavily on the type of data stored in the pool. .Pp Enabling deduplication on an improperly-designed system can result in performance issues (slow I/O and administrative operations). It can potentially lead to problems importing a pool due to memory exhaustion. Deduplication can consume significant processing power (CPU) and memory as well as generate additional disk I/O. .Pp Before creating a pool with deduplication enabled, ensure that you have planned your hardware requirements appropriately and implemented appropriate recovery practices, such as regular backups. Consider using the .Sy compression property as a less resource-intensive alternative. .Ss Block cloning Block cloning is a facility that allows a file (or parts of a file) to be .Qq cloned , that is, a shallow copy made where the existing data blocks are referenced rather than copied. Later modifications to the data will cause a copy of the data block to be taken and that copy modified. This facility is used to implement .Qq reflinks or .Qq file-level copy-on-write . .Pp Cloned blocks are tracked in a special on-disk structure called the Block Reference Table .Po BRT .Pc . Unlike deduplication, this table has minimal overhead, so can be enabled at all times. .Pp Also unlike deduplication, cloning must be requested by a user program. Many common file copying programs, including newer versions of .Nm /bin/cp , will try to create clones automatically. Look for .Qq clone , .Qq dedupe or .Qq reflink in the documentation for more information. .Pp There are some limitations to block cloning. Only whole blocks can be cloned, and blocks can not be cloned if they are not yet written to disk, or if they are encrypted, or the source and destination .Sy recordsize properties differ. The OS may add additional restrictions; for example, most versions of Linux will not allow clones across datasets. diff --git a/sys/contrib/openzfs/man/man7/zfsprops.7 b/sys/contrib/openzfs/man/man7/zfsprops.7 index ac3152cb5d51..0930771c9fce 100644 --- a/sys/contrib/openzfs/man/man7/zfsprops.7 +++ b/sys/contrib/openzfs/man/man7/zfsprops.7 @@ -1,2284 +1,2284 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. 
.\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2011, Pawel Jakub Dawidek .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, Bryan Drewery .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright (c) 2014, Xin LI .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" Copyright (c) 2019, Kjeld Schouten-Lebbing .\" Copyright (c) 2022 Hewlett Packard Enterprise Development LP. .\" -.Dd June 29, 2024 +.Dd August 6, 2025 .Dt ZFSPROPS 7 .Os . .Sh NAME .Nm zfsprops .Nd native and user-defined properties of ZFS datasets . .Sh DESCRIPTION Properties are divided into two types, native properties and user-defined .Po or .Qq user .Pc properties. Native properties either export internal statistics or control ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on ZFS behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about user properties, see the .Sx User Properties section, below. . .Ss Native Properties Every dataset has a set of properties that export statistics about the dataset as well as control various behaviors. Properties are inherited from the parent unless overridden by the child. Some properties apply only to certain types of datasets .Pq file systems, volumes, or snapshots . .Pp The values of numeric properties can be specified using human-readable suffixes .Po for example, .Sy k , .Sy KB , .Sy M , .Sy Gb , and so forth, up to .Sy Z for zettabyte .Pc . The following are all valid .Pq and equal specifications: .Li 1536M , .Li 1.5g , .Li 1.50GB . .Pp The values of non-numeric properties are case sensitive and must be lowercase, except for .Sy mountpoint , .Sy sharenfs , and .Sy sharesmb . .Pp The following native properties consist of read-only statistics about the dataset. These properties can be neither set, nor inherited. Native properties apply to all dataset types unless otherwise noted. .Bl -tag -width "usedbyrefreservation" .It Sy available The amount of space available to the dataset and all its children, assuming that there is no other activity in the pool. Because space is shared within a pool, availability can be limited by any number of factors, including physical pool size, quotas, reservations, or other datasets within the pool. 
.Pp This property can also be referred to by its shortened column name, .Sy avail . .It Sy compressratio For non-snapshots, the compression ratio achieved for the .Sy used space of this dataset, expressed as a multiplier. The .Sy used property includes descendant datasets, and, for clones, does not include the space shared with the origin snapshot. For snapshots, the .Sy compressratio is the same as the .Sy refcompressratio property. Compression can be turned on by running: .Nm zfs Cm set Sy compression Ns = Ns Sy on Ar dataset . The default value is .Sy off . .It Sy createtxg The transaction group (txg) in which the dataset was created. Bookmarks have the same .Sy createtxg as the snapshot they are initially tied to. This property is suitable for ordering a list of snapshots, e.g. for incremental send and receive. .It Sy creation The time this dataset was created. .It Sy clones For snapshots, this property is a comma-separated list of filesystems or volumes which are clones of this snapshot. The clones' .Sy origin property is this snapshot. If the .Sy clones property is not empty, then this snapshot can not be destroyed .Po even with the .Fl r or .Fl f options .Pc . The roles of origin and clone can be swapped by promoting the clone with the .Nm zfs Cm promote command. .It Sy defer_destroy This property is .Sy on if the snapshot has been marked for deferred destroy by using the .Nm zfs Cm destroy Fl d command. Otherwise, the property is .Sy off . .It Sy encryptionroot For encrypted datasets, indicates where the dataset is currently inheriting its encryption key from. Loading or unloading a key for the .Sy encryptionroot will implicitly load / unload the key for any inheriting datasets (see .Nm zfs Cm load-key and .Nm zfs Cm unload-key for details). Clones will always share an encryption key with their origin. See the .Sx Encryption section of .Xr zfs-load-key 8 for details. .It Sy filesystem_count The total number of filesystems and volumes that exist under this location in the dataset tree. This value is only available when a .Sy filesystem_limit has been set somewhere in the tree under which the dataset resides. .It Sy keystatus Indicates if an encryption key is currently loaded into ZFS. The possible values are .Sy none , .Sy available , and .Sy unavailable . See .Nm zfs Cm load-key and .Nm zfs Cm unload-key . .It Sy guid The 64 bit GUID of this dataset or bookmark which does not change over its entire lifetime. When a snapshot is sent to another pool, the received snapshot has the same GUID. Thus, the .Sy guid is suitable to identify a snapshot across pools. .It Sy logicalreferenced The amount of space that is .Qq logically accessible by this dataset. See the .Sy referenced property. The logical space ignores the effect of the .Sy compression and .Sy copies properties, giving a quantity closer to the amount of data that applications see. However, it does include space consumed by metadata. .Pp This property can also be referred to by its shortened column name, .Sy lrefer . .It Sy logicalused The amount of space that is .Qq logically consumed by this dataset and all its descendants. See the .Sy used property. The logical space ignores the effect of the .Sy compression and .Sy copies properties, giving a quantity closer to the amount of data that applications see. However, it does include space consumed by metadata. .Pp This property can also be referred to by its shortened column name, .Sy lused . .It Sy mounted For file systems, indicates whether the file system is currently mounted. 
This property can be either .Sy yes or .Sy no . .It Sy objsetid A unique identifier for this dataset within the pool. Unlike the dataset's .Sy guid , No the Sy objsetid of a dataset is not transferred to other pools when the snapshot is copied with a send/receive operation. The .Sy objsetid can be reused (for a new dataset) after the dataset is deleted. .It Sy origin For cloned file systems or volumes, the snapshot from which the clone was created. See also the .Sy clones property. .It Sy receive_resume_token For filesystems or volumes which have saved partially-completed state from .Nm zfs Cm receive Fl s , this opaque token can be provided to .Nm zfs Cm send Fl t to resume and complete the .Nm zfs Cm receive . .It Sy redact_snaps For bookmarks, this is the list of snapshot GUIDs the bookmark contains a redaction list for. For snapshots, this is the list of snapshot GUIDs the snapshot is redacted with respect to. .It Sy referenced The amount of data that is accessible by this dataset, which may or may not be shared with other datasets in the pool. When a snapshot or clone is created, it initially references the same amount of space as the file system or snapshot it was created from, since its contents are identical. .Pp This property can also be referred to by its shortened column name, .Sy refer . .It Sy refcompressratio The compression ratio achieved for the .Sy referenced space of this dataset, expressed as a multiplier. See also the .Sy compressratio property. .It Sy snapshot_count The total number of snapshots that exist under this location in the dataset tree. This value is only available when a .Sy snapshot_limit has been set somewhere in the tree under which the dataset resides. .It Sy type The type of dataset: .Sy filesystem , .Sy volume , .Sy snapshot , or .Sy bookmark . .It Sy used The amount of space consumed by this dataset and all its descendants. This is the value that is checked against this dataset's quota and reservation. The space used does not include this dataset's reservation, but does take into account the reservations of any descendent datasets. The amount of space that a dataset consumes from its parent, as well as the amount of space that is freed if this dataset is recursively destroyed, is the greater of its space used and its reservation. .Pp The used space of a snapshot .Po see the .Sx Snapshots section of .Xr zfsconcepts 7 .Pc is space that is referenced exclusively by this snapshot. If this snapshot is destroyed, the amount of .Sy used space will be freed. Space that is shared by multiple snapshots isn't accounted for in this metric. When a snapshot is destroyed, space that was previously shared with this snapshot can become unique to snapshots adjacent to it, thus changing the used space of those snapshots. The used space of the latest snapshot can also be affected by changes in the file system. Note that the .Sy used space of a snapshot is a subset of the .Sy written space of the snapshot. .Pp The amount of space used, available, or referenced does not take into account pending changes. Pending changes are generally accounted for within a few seconds. Committing a change to a disk using .Xr fsync 2 or .Sy O_SYNC does not necessarily guarantee that the space usage information is updated immediately. .It Sy usedby* The .Sy usedby* properties decompose the .Sy used properties into the various reasons that space is used. Specifically, .Sy used No = .Sy usedbychildren No + .Sy usedbydataset No + .Sy usedbyrefreservation No + .Sy usedbysnapshots . 
These properties are only available for datasets created on .Nm zpool .Qo version 13 Qc pools. .It Sy usedbychildren The amount of space used by children of this dataset, which would be freed if all the dataset's children were destroyed. .It Sy usedbydataset The amount of space used by this dataset itself, which would be freed if the dataset were destroyed .Po after first removing any .Sy refreservation and destroying any necessary snapshots or descendants .Pc . .It Sy usedbyrefreservation The amount of space used by a .Sy refreservation set on this dataset, which would be freed if the .Sy refreservation was removed. .It Sy usedbysnapshots The amount of space consumed by snapshots of this dataset. In particular, it is the amount of space that would be freed if all of this dataset's snapshots were destroyed. Note that this is not simply the sum of the snapshots' .Sy used properties because space can be shared by multiple snapshots. .It Sy userused Ns @ Ns Ar user The amount of space consumed by the specified user in this dataset. Space is charged to the owner of each file, as displayed by .Nm ls Fl l . The amount of space charged is displayed by .Nm du No and Nm ls Fl s . See the .Nm zfs Cm userspace command for more information. .Pp Unprivileged users can access only their own space usage. The root user, or a user who has been granted the .Sy userused privilege with .Nm zfs Cm allow , can access everyone's usage. .Pp The .Sy userused Ns @ Ns Ar … properties are not displayed by .Nm zfs Cm get Sy all . The user's name must be appended after the .Sy @ symbol, using one of the following forms: .Bl -bullet -compact -offset 4n .It POSIX name .Pq Qq joe .It POSIX numeric ID .Pq Qq 789 .It SID name .Pq Qq joe.smith@mydomain .It SID numeric ID .Pq Qq S-1-123-456-789 .El .Pp Files created on Linux always have POSIX owners. .It Sy userobjused Ns @ Ns Ar user The .Sy userobjused property is similar to .Sy userused but instead it counts the number of objects consumed by a user. This property counts all objects allocated on behalf of the user; it may differ from the results of system tools such as .Nm df Fl i . .Pp When the property .Sy xattr Ns = Ns Sy on is set on a file system, additional objects will be created per-file to store extended attributes. These additional objects are reflected in the .Sy userobjused value and are counted against the user's .Sy userobjquota . When a file system is configured to use .Sy xattr Ns = Ns Sy sa , no additional internal objects are normally required. .It Sy userrefs This property is set to the number of user holds on this snapshot. User holds are set by using the .Nm zfs Cm hold command. .It Sy groupused Ns @ Ns Ar group The amount of space consumed by the specified group in this dataset. Space is charged to the group of each file, as displayed by .Nm ls Fl l . See the .Sy userused Ns @ Ns Ar user property for more information. .Pp Unprivileged users can only access their own groups' space usage. The root user, or a user who has been granted the .Sy groupused privilege with .Nm zfs Cm allow , can access all groups' usage. .It Sy groupobjused Ns @ Ns Ar group The number of objects consumed by the specified group in this dataset. Multiple objects may be charged to the group for each file when extended attributes are in use. See the .Sy userobjused Ns @ Ns Ar user property for more information. .Pp Unprivileged users can only access their own groups' space usage.
The root user, or a user who has been granted the .Sy groupobjused privilege with .Nm zfs Cm allow , can access all groups' usage. .It Sy projectused Ns @ Ns Ar project The amount of space consumed by the specified project in this dataset. A project is identified via the project identifier (ID), a numerical attribute stored on each object. When created, an object can inherit the project ID from its parent object if the parent has the inherit-project-ID flag set (this flag can be set and changed via .Nm chattr Fl /+P or .Nm zfs project Fl s ). A privileged user can set and change an object's project ID via .Nm chattr Fl p or .Nm zfs project Fl s at any time. Space is charged to the project of each file, as displayed by .Nm lsattr Fl p or .Nm zfs project . See the .Sy userused Ns @ Ns Ar user property for more information. .Pp The root user, or a user who has been granted the .Sy projectused privilege with .Nm zfs allow , can access all projects' usage. .It Sy projectobjused Ns @ Ns Ar project The .Sy projectobjused is similar to .Sy projectused but instead it counts the number of objects consumed by the project. When the property .Sy xattr Ns = Ns Sy on is set on a file system, ZFS will create additional objects per-file to store extended attributes. These additional objects are reflected in the .Sy projectobjused value and are counted against the project's .Sy projectobjquota . When a file system is configured to use .Sy xattr Ns = Ns Sy sa , no additional internal objects are required. See the .Sy userobjused Ns @ Ns Ar user property for more information. .Pp The root user, or a user who has been granted the .Sy projectobjused privilege with .Nm zfs allow , can access all projects' object usage. .It Sy snapshots_changed Provides a mechanism to quickly determine whether the snapshot list has changed without having to mount a dataset or iterate the snapshot list. Specifies the time at which a snapshot for a dataset was last created or deleted. .Pp This allows us to be more efficient about how often we query snapshots. The property is persistent across mount and unmount operations only if the .Sy extensible_dataset feature is enabled. .It Sy volblocksize For volumes, specifies the block size of the volume. The .Sy blocksize cannot be changed once the volume has been written, so it should be set at volume creation time. The size specified must be a power of two greater than or equal to .Ar 512 B and less than or equal to .Ar 128 KiB . If the .Sy large_blocks feature is enabled on the pool, the size may be up to .Ar 16 MiB . The default size is .Ar 16 KiB . .Pp This property can also be referred to by its shortened column name, .Sy volblock . .It Sy written The amount of space .Sy referenced by this dataset that was written since the previous snapshot .Pq i.e. that is not referenced by the previous snapshot . .It Sy written Ns @ Ns Ar snapshot The amount of .Sy referenced space written to this dataset since the specified snapshot. This is the space that is referenced by this dataset but was not referenced by the specified snapshot. .Pp The .Ar snapshot may be specified as a short snapshot name .Pq just the part after the Sy @ , in which case it will be interpreted as a snapshot in the same filesystem as this dataset. The .Ar snapshot may be a full snapshot name .Pq Ar filesystem Ns @ Ns Ar snapshot , which for clones may be a snapshot in the origin's filesystem .Pq or the origin of the origin's filesystem, etc. .El .Pp The following native properties can be used to change the behavior of a ZFS dataset.
.Bl -tag -width "" .It Xo .Sy aclinherit Ns = Ns Sy discard Ns | Ns Sy noallow Ns | Ns .Sy restricted Ns | Ns Sy passthrough Ns | Ns Sy passthrough-x .Xc Controls how ACEs are inherited when files and directories are created. .Bl -tag -compact -offset 4n -width "passthrough-x" .It Sy discard does not inherit any ACEs. .It Sy noallow only inherits inheritable ACEs that specify .Qq deny permissions. .It Sy restricted default, removes the .Sy write_acl and .Sy write_owner permissions when the ACE is inherited. .It Sy passthrough inherits all inheritable ACEs without any modifications. .It Sy passthrough-x same meaning as .Sy passthrough , except that the .Sy owner@ , group@ , No and Sy everyone@ ACEs inherit the execute permission only if the file creation mode also requests the execute bit. .El .Pp When the property value is set to .Sy passthrough , files are created with a mode determined by the inheritable ACEs. If no inheritable ACEs exist that affect the mode, then the mode is set in accordance with the requested mode from the application. .Pp The .Sy aclinherit property does not apply to POSIX ACLs. .It Xo .Sy aclmode Ns = Ns Sy discard Ns | Ns Sy groupmask Ns | Ns .Sy passthrough Ns | Ns Sy restricted Ns .Xc Controls how an ACL is modified during .Xr chmod 2 and how inherited ACEs are modified by the file creation mode: .Bl -tag -compact -offset 4n -width "passthrough" .It Sy discard default, deletes all .Sy ACEs except for those representing the mode of the file or directory requested by .Xr chmod 2 . .It Sy groupmask reduces permissions granted in all .Sy ALLOW entries found in the .Sy ACL such that they are no greater than the group permissions specified by .Xr chmod 2 . .It Sy passthrough indicates that no changes are made to the ACL other than creating or updating the necessary ACL entries to represent the new mode of the file or directory. .It Sy restricted will cause the .Xr chmod 2 operation to return an error when used on any file or directory which has a non-trivial ACL whose entries cannot be represented by a mode. .Xr chmod 2 is required to change the set user ID, set group ID, or sticky bits on a file or directory, as they do not have equivalent ACL entries. In order to use .Xr chmod 2 on a file or directory with a non-trivial ACL when .Sy aclmode is set to .Sy restricted , you must first remove all ACL entries which do not represent the current mode. .El .It Sy acltype Ns = Ns Sy off Ns | Ns Sy nfsv4 Ns | Ns Sy posix Controls whether ACLs are enabled and, if so, what type of ACL to use. When this property is set to a type of ACL not supported by the current platform, the behavior is the same as if it were set to .Sy off . .Bl -tag -compact -offset 4n -width "posixacl" .It Sy off default on Linux; when a file system has the .Sy acltype property set to off, ACLs are disabled. .It Sy noacl an alias for .Sy off .It Sy nfsv4 default on .Fx , indicates that NFSv4-style ZFS ACLs should be used. These ACLs can be managed with the .Xr getfacl 1 and .Xr setfacl 1 commands. The .Sy nfsv4 ZFS ACL type is not yet supported on Linux. .It Sy posix indicates POSIX ACLs should be used. POSIX ACLs are specific to Linux and are not functional on other platforms. POSIX ACLs are stored as an extended attribute and therefore will not overwrite any existing NFSv4 ACLs which may be set. .It Sy posixacl an alias for .Sy posix .El .Pp To obtain the best performance when setting .Sy posix , users are strongly encouraged to set the .Sy xattr Ns = Ns Sy sa property.
This will result in the POSIX ACL being stored more efficiently on disk. But as a consequence, all new extended attributes will only be accessible from OpenZFS implementations which support the .Sy xattr Ns = Ns Sy sa property. See the .Sy xattr property for more details. .It Sy atime Ns = Ns Sy on Ns | Ns Sy off Controls whether the access time for files is updated when they are read. Turning this property off avoids producing write traffic when reading files and can result in significant performance gains, though it might confuse mailers and other similar utilities. The values .Sy on and .Sy off are equivalent to the .Sy atime and .Sy noatime mount options. The default value is .Sy on . See also .Sy relatime below. .It Sy canmount Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy noauto If this property is set to .Sy off , the file system cannot be mounted, and is ignored by .Nm zfs Cm mount Fl a . Setting this property to .Sy off is similar to setting the .Sy mountpoint property to .Sy none , except that the dataset still has a normal .Sy mountpoint property, which can be inherited. Setting this property to .Sy off allows datasets to be used solely as a mechanism to inherit properties. One example of setting .Sy canmount Ns = Ns Sy off is to have two datasets with the same .Sy mountpoint , so that the children of both datasets appear in the same directory, but might have different inherited characteristics. .Pp When set to .Sy noauto , a dataset can only be mounted and unmounted explicitly. The dataset is not mounted automatically when the dataset is created or imported, nor is it mounted by the .Nm zfs Cm mount Fl a command or unmounted by the .Nm zfs Cm unmount Fl a command. .Pp This property is not inherited. .It Xo .Sy checksum Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy fletcher2 Ns | Ns .Sy fletcher4 Ns | Ns Sy sha256 Ns | Ns Sy noparity Ns | Ns .Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr Ns | Ns Sy blake3 .Xc Controls the checksum used to verify data integrity. The default value is .Sy on , which automatically selects an appropriate algorithm .Po currently, .Sy fletcher4 , but this may change in future releases .Pc . The value .Sy off disables integrity checking on user data. The value .Sy noparity not only disables integrity checking but also disables maintaining parity for user data. This setting is used internally by a dump device residing on a RAID-Z pool and should not be used by any other dataset. Disabling checksums is .Em NOT a recommended practice. .Pp The .Sy sha512 , .Sy skein , .Sy edonr , and .Sy blake3 checksum algorithms require enabling the appropriate features on the pool. .Pp Please see .Xr zpool-features 7 for more information on these algorithms. .Pp Changing this property affects only newly-written data. .It Xo .Sy compression Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy gzip Ns | Ns .Sy gzip- Ns Ar N Ns | Ns Sy lz4 Ns | Ns Sy lzjb Ns | Ns Sy zle Ns | Ns Sy zstd Ns | Ns .Sy zstd- Ns Ar N Ns | Ns Sy zstd-fast Ns | Ns Sy zstd-fast- Ns Ar N .Xc Controls the compression algorithm used for this dataset. .Pp When set to .Sy on (the default), indicates that the current default compression algorithm should be used. The default balances compression and decompression speed with compression ratio, and is expected to work well on a wide variety of workloads. Unlike all other settings for this property, .Sy on does not select a fixed compression type. As new compression algorithms are added to ZFS and enabled on a pool, the default compression algorithm may change.
The current default compression algorithm is either .Sy lzjb or, if the .Sy lz4_compress feature is enabled, .Sy lz4 . .Pp The .Sy lz4 compression algorithm is a high-performance replacement for the .Sy lzjb algorithm. It features significantly faster compression and decompression, as well as a moderately higher compression ratio than .Sy lzjb , but can only be used on pools with the .Sy lz4_compress feature set to .Sy enabled . See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy lz4_compress feature. .Pp The .Sy lzjb compression algorithm is optimized for performance while providing decent data compression. .Pp The .Sy gzip compression algorithm uses the same compression as the .Xr gzip 1 command. You can specify the .Sy gzip level by using the value .Sy gzip- Ns Ar N , where .Ar N is an integer from 1 .Pq fastest to 9 .Pq best compression ratio . Currently, .Sy gzip is equivalent to .Sy gzip-6 .Po which is also the default for .Xr gzip 1 .Pc . .Pp The .Sy zstd compression algorithm provides both high compression ratios and good performance. You can specify the .Sy zstd level by using the value .Sy zstd- Ns Ar N , where .Ar N is an integer from 1 .Pq fastest to 19 .Pq best compression ratio . .Sy zstd is equivalent to .Sy zstd-3 . .Pp Faster speeds at the cost of the compression ratio can be requested by setting a negative .Sy zstd level. This is done using .Sy zstd-fast- Ns Ar N , where .Ar N is an integer in .Bq Sy 1 Ns - Ns Sy 10 , 20 , 30 , No … , Sy 100 , 500 , 1000 which maps to a negative .Sy zstd level. The lower the level, the faster the compression \(em .Sy 1000 provides the fastest compression and lowest compression ratio. .Sy zstd-fast is equivalent to .Sy zstd-fast- Ns Ar 1 . .Pp The .Sy zle compression algorithm compresses runs of zeros. .Pp This property can also be referred to by its shortened column name, .Sy compress . Changing this property affects only newly-written data. .Pp When any setting except .Sy off is selected, compression will explicitly check for blocks consisting of only zeroes (the NUL byte). When a zero-filled block is detected, it is stored as a hole and not compressed using the indicated compression algorithm. .Pp All blocks are allocated as a whole number of sectors .Pq chunks of 2^ Ns Sy ashift No bytes , e.g . Sy 512B No or Sy 4KB . Compression may result in a non-sector-aligned size, which will be rounded up to a whole number of sectors. If compression saves less than one whole sector, the block will be stored uncompressed. Therefore, blocks whose logical size is a small number of sectors will experience less compression (e.g. for .Sy recordsize Ns = Ns Sy 16K with .Sy 4K sectors, where each block has 4 sectors, compression needs to save at least 25% to actually save space on disk). .Pp There is a .Sy 12.5% default compression threshold in addition to the sector rounding. .It Xo .Sy context Ns = Ns Sy none Ns | Ns .Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level .Xc This flag sets the SELinux context for all files in the file system under a mount point for that file system. See .Xr selinux 8 for more information. .It Xo .Sy fscontext Ns = Ns Sy none Ns | Ns .Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level .Xc This flag sets the SELinux context for the file system being mounted. See .Xr selinux 8 for more information.
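.Pp For example, assuming a hypothetical dataset named tank/share and an SELinux context chosen to match the local policy, the context property could be set with: .Dl # Nm zfs Cm set Sy context Ns = Ns Ar system_u:object_r:samba_share_t:s0 tank/share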
.It Xo .Sy defcontext Ns = Ns Sy none Ns | Ns .Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level .Xc This flag sets the SELinux default context for unlabeled files. See .Xr selinux 8 for more information. .It Xo .Sy rootcontext Ns = Ns Sy none Ns | Ns .Ar SELinux-User : Ns Ar SELinux-Role : Ns Ar SELinux-Type : Ns Ar Sensitivity-Level .Xc This flag sets the SELinux context for the root inode of the file system. See .Xr selinux 8 for more information. .It Sy copies Ns = Ns Sy 1 Ns | Ns Sy 2 Ns | Ns Sy 3 Controls the number of copies of data stored for this dataset. These copies are in addition to any redundancy provided by the pool, for example, mirroring or RAID-Z. The copies are stored on different disks, if possible. The space used by multiple copies is charged to the associated file and dataset, changing the .Sy used property and counting against quotas and reservations. .Pp Changing this property only affects newly-written data. Therefore, set this property at file system creation time by using the .Fl o Sy copies Ns = Ns Ar N option. .Pp Remember that ZFS will not import a pool with a missing top-level vdev. Do .Em NOT create, for example, a two-disk striped pool and set .Sy copies Ns = Ns Ar 2 on some datasets, thinking you have set up redundancy for them. When a disk fails, you will not be able to import the pool and will have lost all of your data. .Pp Encrypted datasets may not have .Sy copies Ns = Ns Ar 3 since the implementation stores some encryption metadata where the third copy would normally be. .It Sy devices Ns = Ns Sy on Ns | Ns Sy off Controls whether device nodes can be opened on this file system. The default value is .Sy on . The values .Sy on and .Sy off are equivalent to the .Sy dev and .Sy nodev mount options. .It Xo .Sy dedup Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy verify Ns | Ns .Sy sha256 Ns Oo , Ns Sy verify Oc Ns | Ns Sy sha512 Ns Oo , Ns Sy verify Oc Ns | Ns Sy skein Ns Oo , Ns Sy verify Oc Ns | Ns .Sy edonr , Ns Sy verify Ns | Ns Sy blake3 Ns Oo , Ns Sy verify Oc Ns .Xc Configures deduplication for a dataset. The default value is .Sy off . The default deduplication checksum is .Sy sha256 (this may change in the future). When .Sy dedup is enabled, the checksum defined here overrides the .Sy checksum property. Setting the value to .Sy verify has the same effect as the setting .Sy sha256 , Ns Sy verify . .Pp If set to .Sy verify , ZFS will do a byte-by-byte comparison when two blocks have the same signature to make sure the block contents are identical. Specifying .Sy verify is mandatory for the .Sy edonr algorithm. .Pp Unless necessary, deduplication should .Em not be enabled on a system. See the .Sx Deduplication section of .Xr zfsconcepts 7 . .It Xo .Sy direct Ns = Ns Sy disabled Ns | Ns Sy standard Ns | Ns Sy always .Xc Controls the behavior of Direct I/O requests .Pq e.g. Dv O_DIRECT . The .Sy standard behavior for Direct I/O requests is to bypass the ARC when possible. These requests will not be cached, and performance will be limited by the raw speed of the underlying disks .Pq Dv this is the default . .Sy always causes every properly aligned read or write to be treated as a direct request. .Sy disabled causes the O_DIRECT flag to be silently ignored, and all direct requests will be handled by the ARC. This is the default behavior for OpenZFS 2.2 and prior releases. .Pp Bypassing the ARC requires that a direct request be correctly aligned.
For write requests, the starting offset and size of the request must be .Sy recordsize Ns -aligned; if not, the unaligned portion of the request will be silently redirected through the ARC. For read requests, there is no .Sy recordsize alignment restriction on either the starting offset or size. All direct requests must use a page-aligned memory buffer, and the request size must be a multiple of the page size, or an error is returned. .Pp Concurrently mixing buffered and direct requests to overlapping regions of a file can decrease performance. However, the resulting file will always be coherent. For example, a direct read after a buffered write will return the data from the buffered write. Furthermore, if an application uses .Xr mmap 2 based file access, then in order to maintain coherency, all direct requests are converted to buffered requests while the file is mapped. Currently, Direct I/O is not supported with zvols. If dedup is enabled on a dataset, Direct I/O writes will not check for deduplication. Deduplication and Direct I/O writes are currently incompatible. .It Xo .Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns .Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k .Xc Specifies a compatibility mode or literal value for the size of dnodes in the file system. The default value is .Sy legacy . Setting this property to a value other than .Sy legacy No requires the Sy large_dnode No pool feature to be enabled . .Pp Consider setting .Sy dnodesize to .Sy auto if the dataset uses the .Sy xattr Ns = Ns Sy sa property setting and the workload makes heavy use of extended attributes. This may be applicable to SELinux-enabled systems, Lustre servers, and Samba servers, for example. Literal values are supported for cases where the optimal size is known in advance and for performance testing. .Pp Leave .Sy dnodesize set to .Sy legacy if you need to receive a send stream of this dataset on a pool that doesn't enable the .Sy large_dnode feature, or if you need to import this pool on a system that doesn't support the .Sy large_dnode No feature . .Pp This property can also be referred to by its shortened column name, .Sy dnsize . .It Xo .Sy encryption Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy aes-128-ccm Ns | Ns .Sy aes-192-ccm Ns | Ns Sy aes-256-ccm Ns | Ns Sy aes-128-gcm Ns | Ns .Sy aes-192-gcm Ns | Ns Sy aes-256-gcm .Xc Controls the encryption cipher suite (block cipher, key length, and mode) used for this dataset. Requires the .Sy encryption feature to be enabled on the pool. Requires a .Sy keyformat to be set at dataset creation time. .Pp Selecting .Sy encryption Ns = Ns Sy on when creating a dataset indicates that the default encryption suite will be selected, which is currently .Sy aes-256-gcm . In order to provide consistent data protection, encryption must be specified at dataset creation time and it cannot be changed afterwards. .Pp For more details and caveats about encryption see the .Sx Encryption section of .Xr zfs-load-key 8 . .It Sy keyformat Ns = Ns Sy raw Ns | Ns Sy hex Ns | Ns Sy passphrase Controls what format the user's encryption key will be provided as. This property is only set when the dataset is encrypted. .Pp Raw keys and hex keys must be 32 bytes long (regardless of the chosen encryption suite) and must be randomly generated.
A raw key can be generated with the following command: .Dl # Nm dd Sy if=/dev/urandom bs=32 count=1 Sy of= Ns Pa /path/to/output/key .Pp Passphrases must be between 8 and 512 bytes long and will be processed through PBKDF2 before being used (see the .Sy pbkdf2iters property). Even though the encryption suite cannot be changed after dataset creation, the keyformat can be changed with .Nm zfs Cm change-key . .It Xo .Sy keylocation Ns = Ns Sy prompt Ns | Ns Sy file:// Ns Ar /absolute/file/path Ns | Ns Sy https:// Ns Ar address Ns | Ns Sy http:// Ns Ar address .Xc Controls where the user's encryption key will be loaded from by default for commands such as .Nm zfs Cm load-key and .Nm zfs Cm mount Fl l . This property is only set for encrypted datasets which are encryption roots. If unspecified, the default is .Sy prompt . .Pp Even though the encryption suite cannot be changed after dataset creation, the keylocation can be changed with either .Nm zfs Cm set or .Nm zfs Cm change-key . If .Sy prompt is selected, ZFS will ask for the key at the command prompt when it is required to access the encrypted data (see .Nm zfs Cm load-key for details). This setting will also allow the key to be passed in via the standard input stream, but users should be careful not to place keys which should be kept secret on the command line. If a file URI is selected, the key will be loaded from the specified absolute file path. If an HTTPS or HTTP URL is selected, it will be GETted using .Xr fetch 3 , libcurl, or nothing, depending on compile-time configuration and run-time availability. The .Sy SSL_CA_CERT_FILE environment variable can be set to specify the location of the concatenated certificate store. The .Sy SSL_CA_CERT_PATH environment variable can be set to override the location of the directory containing the certificate authority bundle. The .Sy SSL_CLIENT_CERT_FILE and .Sy SSL_CLIENT_KEY_FILE environment variables can be set to configure the path to the client certificate and its key. .It Sy pbkdf2iters Ns = Ns Ar iterations Controls the number of PBKDF2 iterations that a .Sy passphrase encryption key should be run through when processing it into an encryption key. This property is only defined when encryption is enabled and a keyformat of .Sy passphrase is selected. The goal of PBKDF2 is to significantly increase the computational difficulty needed to brute force a user's passphrase. This is accomplished by forcing the attacker to run each passphrase through a computationally expensive hashing function many times before they arrive at the resulting key. A user who actually knows the passphrase will only have to pay this cost once. As CPUs become better at processing, this number should be raised to ensure that a brute force attack is still not possible. The current default is .Sy 350000 and the minimum is .Sy 100000 . This property may be changed with .Nm zfs Cm change-key . .It Sy exec Ns = Ns Sy on Ns | Ns Sy off Controls whether processes can be executed from within this file system. The default value is .Sy on . The values .Sy on and .Sy off are equivalent to the .Sy exec and .Sy noexec mount options. .It Sy volthreading Ns = Ns Sy on Ns | Ns Sy off Controls internal zvol threading. The value .Sy off disables zvol threading, and zvol relies on application threads. The default value is .Sy on , which enables threading within a zvol. Please note that this property will be overridden by the .Sy zvol_request_sync module parameter. This property is only applicable to Linux.
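.Pp For example, assuming a hypothetical volume named tank/vol, zvol threading could be disabled with: .Dl # Nm zfs Cm set Sy volthreading Ns = Ns Sy off Ar tank/vol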
.It Sy filesystem_limit Ns = Ns Ar count Ns | Ns Sy none Limits the number of filesystems and volumes that can exist under this point in the dataset tree. The limit is not enforced if the user is allowed to change the limit. Setting a .Sy filesystem_limit on a descendant of a filesystem that already has a .Sy filesystem_limit does not override the ancestor's .Sy filesystem_limit , but rather imposes an additional limit. This feature must be enabled to be used .Po see .Xr zpool-features 7 .Pc . .It Sy special_small_blocks Ns = Ns Ar size This value represents the threshold block size for including small file or zvol blocks into the special allocation class. Blocks smaller than or equal to this value after compression and encryption will be assigned to the special allocation class, while larger blocks will be assigned to the regular class. Valid values are from 0 to the maximum block size ( .Ar 16 MiB ). The default size is 0, which means no small file or zvol blocks will be allocated in the special class. .Pp Before setting this property, a special class vdev must be added to the pool. See .Xr zpoolconcepts 7 for more details on the special allocation class. .It Sy mountpoint Ns = Ns Pa path Ns | Ns Sy none Ns | Ns Sy legacy Controls the mount point used for this file system. See the .Sx Mount Points section of .Xr zfsconcepts 7 for more information on how this property is used. .Pp When the .Sy mountpoint property is changed for a file system, the file system and any children that inherit the mount point are unmounted. If the new value is .Sy legacy , then they remain unmounted. Otherwise, they are automatically remounted in the new location if the property was previously .Sy legacy or .Sy none . In addition, any shared file systems are unshared and shared in the new location. .Pp When the .Sy mountpoint property is set with .Nm zfs Cm set Fl u , the .Sy mountpoint property is updated, but the dataset is not mounted or unmounted and remains as it was before. .It Sy nbmand Ns = Ns Sy on Ns | Ns Sy off Controls whether the file system should be mounted with .Sy nbmand .Pq Non-blocking mandatory locks . Changes to this property only take effect when the file system is unmounted and remounted. This was only supported by Linux prior to 5.15, and was buggy there, and is not supported by .Fx . On Solaris it's used for SMB clients. .It Sy overlay Ns = Ns Sy on Ns | Ns Sy off Allow mounting on a busy directory or a directory which already contains files or directories. This is the default mount behavior for Linux and .Fx file systems. On these platforms the property is .Sy on by default. Set to .Sy off to disable overlay mounts for consistency with OpenZFS on other platforms. .It Sy primarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata Controls what is cached in the primary cache .Pq ARC . If this property is set to .Sy all , then both user data and metadata are cached. If this property is set to .Sy none , then neither user data nor metadata is cached. If this property is set to .Sy metadata , then only metadata is cached. The default value is .Sy all . .It Sy quota Ns = Ns Ar size Ns | Ns Sy none Limits the amount of space a dataset and its descendants can consume. This property enforces a hard limit on the amount of space used. This includes all space consumed by descendants, including file systems and snapshots. Setting a quota on a descendant of a dataset that already has a quota does not override the ancestor's quota, but rather imposes an additional limit.
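.Pp For example, assuming a hypothetical dataset named tank/home, a hard limit of 10 GiB on the space consumed by the dataset and its descendants could be imposed with: .Dl # Nm zfs Cm set Sy quota Ns = Ns Ar 10G tank/home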
.Pp Quotas cannot be set on volumes, as the .Sy volsize property acts as an implicit quota. .It Sy snapshot_limit Ns = Ns Ar count Ns | Ns Sy none Limits the number of snapshots that can be created on a dataset and its descendants. Setting a .Sy snapshot_limit on a descendant of a dataset that already has a .Sy snapshot_limit does not override the ancestor's .Sy snapshot_limit , but rather imposes an additional limit. The limit is not enforced if the user is allowed to change the limit. For example, this means that recursive snapshots taken from the global zone are counted against each delegated dataset within a zone. This feature must be enabled to be used .Po see .Xr zpool-features 7 .Pc . .It Sy userquota@ Ns Ar user Ns = Ns Ar size Ns | Ns Sy none Limits the amount of space consumed by the specified user. User space consumption is identified by the .Sy userused@ Ns Ar user property. .Pp Enforcement of user quotas may be delayed by several seconds. This delay means that a user might exceed their quota before the system notices that they are over quota and begins to refuse additional writes with the .Er EDQUOT error message. See the .Nm zfs Cm userspace command for more information. .Pp Unprivileged users can only access their own quota. The root user, or a user who has been granted the .Sy userquota privilege with .Nm zfs Cm allow , can get and set everyone's quota. .Pp This property is not available on volumes, on file systems before version 4, or on pools before version 15. The .Sy userquota@ Ns Ar … properties are not displayed by .Nm zfs Cm get Sy all . The user's name must be appended after the .Sy @ symbol, using one of the following forms: .Bl -bullet -compact -offset 4n .It POSIX name .Pq Qq joe .It POSIX numeric ID .Pq Qq 789 .It SID name .Pq Qq joe.smith@mydomain .It SID numeric ID .Pq Qq S-1-123-456-789 .El .Pp Files created on Linux always have POSIX owners. .It Sy defaultuserquota Ns = Ns Ar size Ns | Ns Sy none Sets a default user quota to be applied to each user for whom no user-specific quota is set. The value .Sy 0 disables defaultuserquota. .It Sy userobjquota@ Ns Ar user Ns = Ns Ar size Ns | Ns Sy none The .Sy userobjquota is similar to .Sy userquota but it limits the number of objects a user can create. Please refer to .Sy userobjused for more information about how objects are counted. .It Sy defaultuserobjquota Ns = Ns Ar size Ns | Ns Sy none Sets a default user object quota to be applied to each user for whom no userobj-specific quota is set. The value .Sy 0 disables defaultuserobjquota. .It Sy groupquota@ Ns Ar group Ns = Ns Ar size Ns | Ns Sy none Limits the amount of space consumed by the specified group. Group space consumption is identified by the .Sy groupused@ Ns Ar group property. .Pp Unprivileged users can access only their own groups' space usage. The root user, or a user who has been granted the .Sy groupquota privilege with .Nm zfs Cm allow , can get and set all groups' quotas. .It Sy defaultgroupquota Ns = Ns Ar size Ns | Ns Sy none Sets a default group quota to be applied to each group for which no group-specific quota is set. The value .Sy 0 disables defaultgroupquota. .It Sy groupobjquota@ Ns Ar group Ns = Ns Ar size Ns | Ns Sy none The .Sy groupobjquota is similar to .Sy groupquota but it limits the number of objects a group can consume. Please refer to .Sy userobjused for more information about how objects are counted.
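.Pp For example, per-user and per-group limits could be applied to a hypothetical dataset named tank/home (the user and group names here are placeholders): .Dl # Nm zfs Cm set Sy userquota@joe Ns = Ns Ar 50G tank/home .Dl # Nm zfs Cm set Sy groupquota@staff Ns = Ns Ar 100G tank/home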
.It Sy defaultgroupobjquota Ns = Ns Ar size Ns | Ns Sy none Sets a default group object quota to be applied to each group for which no groupobj-specific quota is set. The value .Sy 0 disables defaultgroupobjquota. .It Sy projectquota@ Ns Ar project Ns = Ns Ar size Ns | Ns Sy none Limits the amount of space consumed by the specified project. Project space consumption is identified by the .Sy projectused@ Ns Ar project property. Please refer to .Sy projectused for more information about how a project is identified and set or changed. .Pp The root user, or a user who has been granted the .Sy projectquota privilege with .Nm zfs allow , can access all projects' quotas. .It Sy defaultprojectquota Ns = Ns Ar size Ns | Ns Sy none Sets a default project quota to be applied to each project for which no project-specific quota is set. The value .Sy 0 disables defaultprojectquota. .It Sy projectobjquota@ Ns Ar project Ns = Ns Ar size Ns | Ns Sy none The .Sy projectobjquota is similar to .Sy projectquota but it limits the number of objects a project can consume. Please refer to .Sy userobjused for more information about how objects are counted. .It Sy defaultprojectobjquota Ns = Ns Ar size Ns | Ns Sy none Sets a default project object quota to be applied to each project for which no projectobj-specific quota is set. The value .Sy 0 disables defaultprojectobjquota. .It Sy readonly Ns = Ns Sy on Ns | Ns Sy off Controls whether this dataset can be modified. The default value is .Sy off . The values .Sy on and .Sy off are equivalent to the .Sy ro and .Sy rw mount options. .Pp This property can also be referred to by its shortened column name, .Sy rdonly . .It Sy recordsize Ns = Ns Ar size Specifies a suggested block size for files in the file system. This property is designed solely for use with database workloads that access files in fixed-size records. ZFS automatically tunes block sizes according to internal algorithms optimized for typical access patterns. .Pp For databases that create very large files but access them in small random chunks, these algorithms may be suboptimal. Specifying a .Sy recordsize greater than or equal to the record size of the database can result in significant performance gains. Use of this property for general purpose file systems is strongly discouraged, and may adversely affect performance. .Pp The size specified must be a power of two greater than or equal to .Ar 512 B and less than or equal to .Ar 128 KiB . If the .Sy large_blocks feature is enabled on the pool, the size may be up to .Ar 16 MiB . See .Xr zpool-features 7 for details on ZFS feature flags. .Pp Note that the maximum size is still limited by default to .Ar 1 MiB on x86_32; see the .Sy zfs_max_recordsize module parameter. .Pp Changing the file system's .Sy recordsize affects only files created afterward; existing files are unaffected. .Pp This property can also be referred to by its shortened column name, .Sy recsize . .It Sy redundant_metadata Ns = Ns Sy all Ns | Ns Sy most Ns | Ns Sy some Ns | Ns Sy none Controls what types of metadata are stored redundantly. ZFS stores an extra copy of metadata, so that if a single block is corrupted, the amount of user data lost is limited. This extra copy is in addition to any redundancy provided at the pool level .Pq e.g. by mirroring or RAID-Z , and is in addition to an extra copy specified by the .Sy copies property .Pq up to a total of 3 copies .
For example if the pool is mirrored, .Sy copies Ns = Ns 2 , and .Sy redundant_metadata Ns = Ns Sy most , then ZFS stores 6 copies of most metadata, and 4 copies of data and some metadata. .Pp When set to .Sy all , ZFS stores an extra copy of all metadata. If a single on-disk block is corrupt, at worst a single block of user data .Po which is .Sy recordsize bytes long .Pc can be lost. .Pp When set to .Sy most , ZFS stores an extra copy of most types of metadata. This can improve performance of random writes, because less metadata must be written. In practice, at worst about 1000 blocks .Po of .Sy recordsize bytes each .Pc of user data can be lost if a single on-disk block is corrupt. The exact behavior of which metadata blocks are stored redundantly may change in future releases. .Pp When set to .Sy some , ZFS stores an extra copy of only critical metadata. This can improve file create performance since less metadata needs to be written. If a single on-disk block is corrupt, multiple user files or directories can be lost. .Pp When set to .Sy none , ZFS does not store any copies of metadata redundantly. If a single on-disk block is corrupt, an entire dataset can be lost. .Pp The default value is .Sy all . .It Sy refquota Ns = Ns Ar size Ns | Ns Sy none Limits the amount of space a dataset can consume. This property enforces a hard limit on the amount of space used. This hard limit does not include space used by descendants, including file systems and snapshots. .It Sy refreservation Ns = Ns Ar size Ns | Ns Sy none Ns | Ns Sy auto The minimum amount of space guaranteed to a dataset, not including its descendants. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by .Sy refreservation . The .Sy refreservation reservation is accounted for in the parent datasets' space used, and counts against the parent datasets' quotas and reservations. .Pp If .Sy refreservation is set, a snapshot is only allowed if there is enough free pool space outside of this reservation to accommodate the current number of .Qq referenced bytes in the dataset. .Pp If .Sy refreservation is set to .Sy auto , a volume is thick provisioned .Po or .Qq not sparse .Pc . .Sy refreservation Ns = Ns Sy auto is only supported on volumes. See .Sy volsize in the .Sx Native Properties section for more information about sparse volumes. .Pp This property can also be referred to by its shortened column name, .Sy refreserv . .It Sy relatime Ns = Ns Sy on Ns | Ns Sy off Controls the manner in which the access time is updated when .Sy atime Ns = Ns Sy on is set. Turning this property on causes the access time to be updated relative to the modify or change time. Access time is only updated if the previous access time was earlier than the current modify or change time or if the existing access time hasn't been updated within the past 24 hours. The default value is .Sy on . The values .Sy on and .Sy off are equivalent to the .Sy relatime and .Sy norelatime mount options. .It Sy reservation Ns = Ns Ar size Ns | Ns Sy none The minimum amount of space guaranteed to a dataset and its descendants. When the amount of space used is below this value, the dataset is treated as if it were taking up the amount of space specified by its reservation. Reservations are accounted for in the parent datasets' space used, and count against the parent datasets' quotas and reservations. .Pp This property can also be referred to by its shortened column name, .Sy reserv . 
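.Pp For example, assuming a hypothetical dataset named tank/projects, a minimum of 10 GiB could be guaranteed to the dataset and its descendants with: .Dl # Nm zfs Cm set Sy reservation Ns = Ns Ar 10G tank/projects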
.It Sy secondarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata Controls what is cached in the secondary cache .Pq L2ARC . If this property is set to .Sy all , then both user data and metadata are cached. If this property is set to .Sy none , then neither user data nor metadata is cached. If this property is set to .Sy metadata , then only metadata is cached. The default value is .Sy all . .It Sy prefetch Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata Controls what speculative prefetch does. If this property is set to .Sy all , then both user data and metadata are prefetched. If this property is set to .Sy none , then neither user data nor metadata are prefetched. If this property is set to .Sy metadata , then only metadata are prefetched. The default value is .Sy all . .Pp Please note that the module parameter zfs_prefetch_disable=1 can be used to totally disable speculative prefetch, bypassing anything this property does. .It Sy setuid Ns = Ns Sy on Ns | Ns Sy off Controls whether the setuid bit is respected for the file system. The default value is .Sy on . The values .Sy on and .Sy off are equivalent to the .Sy suid and .Sy nosuid mount options. .It Sy sharesmb Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Ar opts Controls whether the file system is shared by using .Sy Samba USERSHARES and what options are to be used. Otherwise, the file system is automatically shared and unshared with the .Nm zfs Cm share and .Nm zfs Cm unshare commands. If the property is set to on, the .Xr net 8 command is invoked to create a .Sy USERSHARE . .Pp Because SMB shares require a resource name, a unique resource name is constructed from the dataset name. The constructed name is a copy of the dataset name, except that characters in the dataset name which would be invalid in the resource name are replaced with underscore (_) characters. Linux does not currently support additional options which might be available on Solaris. .Pp If the .Sy sharesmb property is set to .Sy off , the file systems are unshared. .Pp The share is created with the ACL (Access Control List) "Everyone:F" ("F" stands for "full permissions", i.e. read and write permissions) and no guest access (which means Samba must be able to authenticate a real user \(em .Xr passwd 5 Ns / Ns Xr shadow 5 Ns - , LDAP- or .Xr smbpasswd 5 Ns -based ) by default. This means that any additional access control (e.g. disallowing access for specific users) must be done on the underlying file system. .Pp When the .Sy sharesmb property is updated with .Nm zfs Cm set Fl u , the property is set to the desired value, but the operation to share, reshare or unshare the dataset is not performed. .It Sy sharenfs Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Ar opts Controls whether the file system is shared via NFS, and what options are to be used. A file system with a .Sy sharenfs property of .Sy off is managed with the .Xr exportfs 8 command and entries in the .Pa /etc/exports file. Otherwise, the file system is automatically shared and unshared with the .Nm zfs Cm share and .Nm zfs Cm unshare commands. If the property is set to .Sy on , the dataset is shared using the default options: .Dl sec=sys,rw,crossmnt,no_subtree_check .Pp Please note that the options are comma-separated, unlike those found in .Xr exports 5 . This is done to negate the need for quoting, as well as to make parsing with scripts easier. .Pp For .Fx , there may be multiple sets of options separated by semicolon(s).
Each set of options must apply to different hosts or networks, and each set will create a separate line for .Xr exports 5 . Any semicolon-separated option set that consists entirely of whitespace will be ignored. This use of semicolons is only for .Fx at this time. .Pp See .Xr exports 5 for the meaning of the default options. Otherwise, the .Xr exportfs 8 command is invoked with options equivalent to the contents of this property. .Pp When the .Sy sharenfs property is changed for a dataset, the dataset and any children inheriting the property are re-shared with the new options only if the property was previously .Sy off , or if they were shared before the property was changed. If the new property is .Sy off , the file systems are unshared. .Pp When the .Sy sharenfs property is updated with .Nm zfs Cm set Fl u , the property is set to the desired value, but the operation to share, reshare or unshare the dataset is not performed. .It Sy logbias Ns = Ns Sy latency Ns | Ns Sy throughput Provide a hint to ZFS about handling of synchronous write requests in this dataset. If .Sy logbias is set to .Sy latency .Pq the default , ZFS will use pool log devices .Pq if configured to handle the write requests at low latency. If .Sy logbias is set to .Sy throughput , ZFS will not use configured pool log devices to store written data. ZFS will instead optimize synchronous operations for global pool throughput and efficient use of resources. .It Sy snapdev Ns = Ns Sy hidden Ns | Ns Sy visible Controls whether the volume snapshot devices under .Pa /dev/zvol/ Ns Aq Ar pool are hidden or visible. The default value is .Sy hidden . .It Sy snapdir Ns = Ns Sy disabled Ns | Ns Sy hidden Ns | Ns Sy visible Controls whether the .Pa .zfs directory is disabled, hidden or visible in the root of the file system as discussed in the .Sx Snapshots section of .Xr zfsconcepts 7 . The default value is .Sy hidden . .It Sy sync Ns = Ns Sy standard Ns | Ns Sy always Ns | Ns Sy disabled Controls the behavior of synchronous requests .Pq e.g. fsync, O_DSYNC . .Sy standard is the POSIX-specified behavior of ensuring all synchronous requests are written to stable storage and all devices are flushed to ensure data is not cached by device controllers .Pq this is the default . .Sy always causes every file system transaction to be written and flushed before its system call returns. This has a large performance penalty. .Sy disabled disables synchronous requests. File system transactions are only committed to stable storage periodically. This option will give the highest performance. However, it is very dangerous as ZFS would be ignoring the synchronous transaction demands of applications such as databases or NFS. Administrators should only use this option when the risks are understood. .It Sy version Ns = Ns Ar N Ns | Ns Sy current The on-disk version of this file system, which is independent of the pool version. This property can only be set to later supported versions. See the .Nm zfs Cm upgrade command. .It Sy volsize Ns = Ns Ar size For volumes, specifies the logical size of the volume. By default, creating a volume establishes a reservation of equal size. For storage pools with a version number of 9 or higher, a .Sy refreservation is set instead. Any changes to .Sy volsize are reflected in an equivalent change to the reservation .Pq or Sy refreservation . The .Sy volsize can only be set to a multiple of .Sy volblocksize , and cannot be zero.
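.Pp For example, assuming a hypothetical pool named tank, a 10 GiB volume with an 8 KiB block size could be created with: .Dl # Nm zfs Cm create Fl V Ar 10G Fl o Sy volblocksize Ns = Ns Ar 8K tank/vol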
.Pp The reservation is kept equal to the volume's logical size to prevent unexpected behavior for consumers. Without the reservation, the volume could run out of space, resulting in undefined behavior or data corruption, depending on how the volume is used. These effects can also occur when the volume size is changed while it is in use .Pq particularly when shrinking the size . Extreme care should be used when adjusting the volume size. .Pp Though not recommended, a .Qq sparse volume .Po also known as .Qq thin provisioned .Pc can be created by specifying the .Fl s option to the .Nm zfs Cm create Fl V command, or by changing the value of the .Sy refreservation property .Po or .Sy reservation property on pool version 8 or earlier .Pc after the volume has been created. A .Qq sparse volume is a volume where the value of .Sy refreservation is less than the size of the volume plus the space required to store its metadata. Consequently, writes to a sparse volume can fail with .Er ENOSPC when the pool is low on space. For a sparse volume, changes to .Sy volsize are not reflected in the .Sy refreservation . A volume that is not sparse is said to be .Qq thick provisioned . A sparse volume can become thick provisioned by setting .Sy refreservation to .Sy auto . .It Sy volmode Ns = Ns Sy default Ns | Ns Sy full Ns | Ns Sy geom Ns | Ns Sy dev Ns | Ns Sy none This property specifies how volumes should be exposed to the OS. Setting it to .Sy full exposes volumes as fully fledged block devices, providing maximal functionality. The value .Sy geom is just an alias for .Sy full and is kept for compatibility. Setting it to .Sy dev hides the volume's partitions. Volumes with this property set to .Sy none are not exposed outside ZFS, but can still be snapshotted, cloned, replicated, etc.; this can be suitable for backup purposes. The value .Sy default means that volume exposure is controlled by the system-wide tunable .Sy zvol_volmode , where .Sy full , .Sy dev and .Sy none are encoded as 1, 2 and 3 respectively. The default value is .Sy full . .It Sy vscan Ns = Ns Sy on Ns | Ns Sy off Controls whether regular files should be scanned for viruses when a file is opened and closed. In addition to enabling this property, the virus scan service must also be enabled for virus scanning to occur. The default value is .Sy off . This property is not used by OpenZFS. .It Sy xattr Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy dir Ns | Ns Sy sa Controls whether extended attributes are enabled for this file system. Two styles of extended attributes are supported: either directory-based or system-attribute-based. .Pp Directory-based extended attributes can be enabled by setting the value to .Sy dir . This style of extended attribute imposes no practical limit on either the size or number of attributes which can be set on a file. Under Linux, however, the .Xr getxattr 2 and .Xr setxattr 2 system calls limit the maximum size to .Sy 64K . This is the most compatible style of extended attribute and is supported by all ZFS implementations. .Pp System-attribute-based xattrs can be enabled by setting the value to .Sy sa (default and equal to .Sy on ) . The key advantage of this type of xattr is improved performance. Storing extended attributes as system attributes significantly decreases the amount of disk I/O required. Up to .Sy 64K of data may be stored per-file in the space reserved for system attributes. If there is not enough space available for an extended attribute, then it will be automatically written as a directory-based xattr.
System-attribute-based extended attributes are not accessible on platforms which do not support the .Sy xattr Ns = Ns Sy sa feature. OpenZFS supports .Sy xattr Ns = Ns Sy sa on both .Fx and Linux. .Pp The use of system-attribute-based xattrs is strongly encouraged for users of SELinux or POSIX ACLs. Both of these features heavily rely on extended attributes and benefit significantly from the reduced access time. .Pp The values .Sy on and .Sy off are equivalent to the .Sy xattr and .Sy noxattr mount options. .It Sy jailed Ns = Ns Sy off Ns | Ns Sy on Controls whether the dataset is managed from a jail. See .Xr zfs-jail 8 for more information. Jails are a .Fx feature, and this property is not available on other platforms. .It Sy zoned Ns = Ns Sy off Ns | Ns Sy on Controls whether the dataset is managed from a non-global zone or namespace. See .Xr zfs-zone 8 for more information. Zoning is a Linux feature, and this property is not available on other platforms. .El .Pp The following three properties cannot be changed after the file system is created, and therefore should be set when the file system is created. If the properties are not set with the .Nm zfs Cm create or .Nm zpool Cm create commands, these properties are inherited from the parent dataset. If the parent dataset lacks these properties due to having been created prior to these features being supported, the new file system will have the default values for these properties. .Bl -tag -width "" .It Xo .Sy casesensitivity Ns = Ns Sy sensitive Ns | Ns .Sy insensitive Ns | Ns Sy mixed .Xc Indicates whether the file name matching algorithm used by the file system should be case-sensitive, case-insensitive, or allow a combination of both styles of matching. The default value for the .Sy casesensitivity property is .Sy sensitive . Traditionally, .Ux and POSIX file systems have case-sensitive file names. .Pp The .Sy mixed value for the .Sy casesensitivity property indicates that the file system can support requests for both case-sensitive and case-insensitive matching behavior. Currently, case-insensitive matching behavior on a file system that supports mixed behavior is limited to the SMB server product. For more information about the .Sy mixed value behavior, see the "ZFS Administration Guide". .It Xo .Sy normalization Ns = Ns Sy none Ns | Ns Sy formC Ns | Ns .Sy formD Ns | Ns Sy formKC Ns | Ns Sy formKD .Xc Indicates whether the file system should perform a .Sy Unicode normalization of file names whenever two file names are compared, and which normalization algorithm should be used. File names are always stored unmodified; names are normalized as part of any comparison process. If this property is set to a legal value other than .Sy none , and the .Sy utf8only property was left unspecified, the .Sy utf8only property is automatically set to .Sy on . The default value of the .Sy normalization property is .Sy none . This property cannot be changed after the file system is created. .It Sy utf8only Ns = Ns Sy on Ns | Ns Sy off Indicates whether the file system should reject file names that include characters that are not present in the .Sy UTF-8 character code set. If this property is explicitly set to .Sy off , the normalization property must either not be explicitly set or be set to .Sy none . The default value for the .Sy utf8only property is .Sy off . This property cannot be changed after the file system is created.
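.Pp For example, these creation-time properties could be set on a hypothetical file system named tank/share intended for case-insensitive SMB access with: .Dl # Nm zfs Cm create Fl o Sy casesensitivity Ns = Ns Sy mixed Fl o Sy normalization Ns = Ns Sy formD Fl o Sy utf8only Ns = Ns Sy on Ar tank/share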
.El .Pp The .Sy casesensitivity , .Sy normalization , and .Sy utf8only properties are also new permissions that can be assigned to non-privileged users by using the ZFS delegated administration feature. . .Ss Temporary Mount Point Properties When a file system is mounted, either through .Xr mount 8 for legacy mounts or the .Nm zfs Cm mount command for normal file systems, its mount options are set according to its properties. The correlation between properties and mount options is as follows: .Bl -tag -compact -offset Ds -width "rootcontext=" .It Sy atime atime/noatime .It Sy canmount auto/noauto .It Sy devices dev/nodev .It Sy exec exec/noexec .It Sy readonly ro/rw .It Sy relatime relatime/norelatime .It Sy setuid suid/nosuid .It Sy xattr xattr/noxattr .It Sy nbmand mand/nomand .It Sy context Ns = context= .It Sy fscontext Ns = fscontext= .It Sy defcontext Ns = defcontext= .It Sy rootcontext Ns = rootcontext= .El .Pp In addition, these options can be set on a per-mount basis using the .Fl o option, without affecting the property that is stored on disk. The values specified on the command line override the values stored in the dataset. The .Sy nosuid option is an alias for .Sy nodevices , Ns Sy nosetuid . These properties are reported as .Qq temporary by the .Nm zfs Cm get command. If the properties are changed while the dataset is mounted, the new setting overrides any temporary settings. . .Ss User Properties In addition to the standard native properties, ZFS supports arbitrary user properties. User properties have no effect on ZFS behavior, but applications or administrators can use them to annotate datasets .Pq file systems, volumes, and snapshots . .Pp User property names must contain a colon .Pq Qq Sy \&: character to distinguish them from native properties. They may contain lowercase letters, numbers, and the following punctuation characters: colon .Pq Qq Sy \&: , dash .Pq Qq Sy - , period .Pq Qq Sy \&. , and underscore .Pq Qq Sy _ . The expected convention is that the property name is divided into two portions such as .Ar module : Ns Ar property , but this namespace is not enforced by ZFS. User property names can be at most 256 characters, and cannot begin with a dash .Pq Qq Sy - . .Pp When making programmatic use of user properties, it is strongly suggested to use a reversed DNS domain name for the .Ar module component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. .Pp The values of user properties are arbitrary strings, are always inherited, and are never validated. All of the commands that operate on properties .Po Nm zfs Cm list , .Nm zfs Cm get , .Nm zfs Cm set , and so forth .Pc can be used to manipulate both native properties and user properties. Use the .Nm zfs Cm inherit command to clear a user property. If the property is not defined in any parent dataset, it is removed entirely. Property values are limited to 8192 bytes. diff --git a/sys/contrib/openzfs/man/man7/zpool-features.7 b/sys/contrib/openzfs/man/man7/zpool-features.7 index 10dfd1f92936..b4404a6eb58d 100644 --- a/sys/contrib/openzfs/man/man7/zpool-features.7 +++ b/sys/contrib/openzfs/man/man7/zpool-features.7 @@ -1,1102 +1,1102 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. 
.\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at .\" usr/src/OPENSOLARIS.LICENSE or https://opensource.org/licenses/CDDL-1.0. .\" .\" See the License for the specific language governing permissions and .\" limitations under the License. When distributing Covered Code, include this .\" CDDL HEADER in each file and include the License file at .\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your .\" own identifying information: .\" Portions Copyright [yyyy] [name of copyright owner] .\" Copyright (c) 2019, 2023, 2024, Klara, Inc. .\" Copyright (c) 2019, Allan Jude .\" Copyright (c) 2021, Colm Buckley .\" -.Dd October 2, 2024 +.Dd July 23, 2025 .Dt ZPOOL-FEATURES 7 .Os . .Sh NAME .Nm zpool-features .Nd description of ZFS pool features . .Sh DESCRIPTION ZFS pool on-disk format versions are specified via .Dq features which replace the old on-disk format numbers .Pq the last supported on-disk format number is 28 . To enable a feature on a pool use the .Nm zpool Cm upgrade , or set the .Sy feature Ns @ Ns Ar feature-name property to .Sy enabled . Please also see the .Sx Compatibility feature sets section for information on how sets of features may be enabled together. .Pp The pool format does not affect file system version compatibility or the ability to send file systems between pools. .Pp Since most features can be enabled independently of each other, the on-disk format of the pool is specified by the set of all features marked as .Sy active on the pool. If the pool was created by another software version this set may include unsupported features. . .Ss Identifying features Every feature has a GUID of the form .Ar com.example : Ns Ar feature-name . The reversed DNS name ensures that the feature's GUID is unique across all ZFS implementations. When unsupported features are encountered on a pool they will be identified by their GUIDs. Refer to the documentation for the ZFS implementation that created the pool for information about those features. .Pp Each supported feature also has a short name. By convention a feature's short name is the portion of its GUID which follows the .Sq \&: .Po i.e. .Ar com.example : Ns Ar feature-name would have the short name .Ar feature-name .Pc , however a feature's short name may differ across ZFS implementations if following the convention would result in name conflicts. . .Ss Feature states Features can be in one of three states: .Bl -tag -width "disabled" .It Sy active This feature's on-disk format changes are in effect on the pool. Support for this feature is required to import the pool in read-write mode. If this feature is not read-only compatible, support is also required to import the pool in read-only mode .Pq see Sx Read-only compatibility . .It Sy enabled An administrator has marked this feature as enabled on the pool, but the feature's on-disk format changes have not been made yet. The pool can still be imported by software that does not support this feature, but changes may be made to the on-disk format at any time which will move the feature to the .Sy active state. Some features may support returning to the .Sy enabled state after becoming .Sy active . See feature-specific documentation for details. 
.It Sy disabled This feature's on-disk format changes have not been made and will not be made unless an administrator moves the feature to the .Sy enabled state. Features cannot be disabled once they have been enabled. .El .Pp The state of supported features is exposed through pool properties of the form .Sy feature Ns @ Ns Ar short-name . . .Ss Read-only compatibility Some features may make on-disk format changes that do not interfere with other software's ability to read from the pool. These features are referred to as .Dq read-only compatible . If all unsupported features on a pool are read-only compatible, the pool can be imported in read-only mode by setting the .Sy readonly property during import .Po see .Xr zpool-import 8 for details on importing pools .Pc . . .Ss Unsupported features For each unsupported feature enabled on an imported pool, a pool property named .Sy unsupported Ns @ Ns Ar feature-name will indicate why the import was allowed despite the unsupported feature. Possible values for this property are: .Bl -tag -width "readonly" .It Sy inactive The feature is in the .Sy enabled state and therefore the pool's on-disk format is still compatible with software that does not support this feature. .It Sy readonly The feature is read-only compatible and the pool has been imported in read-only mode. .El . .Ss Feature dependencies Some features depend on other features being enabled in order to function. Enabling a feature will automatically enable any features it depends on. . .Ss Compatibility feature sets It is sometimes necessary for a pool to maintain compatibility with a specific on-disk format, by enabling and disabling particular features. The .Sy compatibility feature facilitates this by allowing feature sets to be read from text files. When set to .Sy off .Pq the default , compatibility feature sets are disabled .Pq i.e. all features are enabled ; when set to .Sy legacy , no features are enabled. When set to a comma-separated list of filenames .Po each filename may either be an absolute path, or relative to .Pa /etc/zfs/compatibility.d or .Pa /usr/share/zfs/compatibility.d .Pc , the lists of requested features are read from those files, separated by whitespace and/or commas. Only features present in all files are enabled. .Pp Simple sanity checks are applied to the files: they must be between 1 B and 16 KiB in size, and must end with a newline character. .Pp The requested features are applied when a pool is created using .Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar … and controls which features are enabled when using .Nm zpool Cm upgrade . .Nm zpool Cm status will not show a warning about disabled features which are not part of the requested feature set. .Pp The special value .Sy legacy prevents any features from being enabled, either via .Nm zpool Cm upgrade or .Nm zpool Cm set Sy feature Ns @ Ns Ar feature-name Ns = Ns Sy enabled . This setting also prevents pools from being upgraded to newer on-disk versions. This is a safety measure to prevent new features from being accidentally enabled, breaking compatibility. .Pp By convention, compatibility files in .Pa /usr/share/zfs/compatibility.d are provided by the distribution, and include feature sets supported by important versions of popular distributions, and feature sets commonly supported at the start of each year. Compatibility files in .Pa /etc/zfs/compatibility.d , if present, will take precedence over files with the same name in .Pa /usr/share/zfs/compatibility.d . 
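.Pp
For example, a pool that must remain readable by GRUB2
.Pq the pool name Ar bootpool No below is hypothetical
might be constrained and then inspected as follows:
.Bd -literal -compact -offset 4n
.No # Nm zpool Cm set Sy compatibility Ns = Ns Ar grub2 Ar bootpool
.No # Nm zpool Cm get Sy compatibility Ar bootpool
.Ed
.Pp
Subsequent
.Nm zpool Cm upgrade
invocations will then only enable features listed in the requested compatibility file.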
.Pp If an unrecognized feature is found in these files, an error message will be shown. If the unrecognized feature is in a file in .Pa /etc/zfs/compatibility.d , this is treated as an error and processing will stop. If the unrecognized feature is under .Pa /usr/share/zfs/compatibility.d , this is treated as a warning and processing will continue. This difference is to allow distributions to include features which might not be recognized by the currently-installed binaries. .Pp Compatibility files may include comments: any text from .Sq # to the end of the line is ignored. .Pp .Sy Example : .Bd -literal -compact -offset 4n .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2 # Features which are supported by GRUB2 versions from v2.12 onwards. allocation_classes async_destroy block_cloning bookmarks device_rebuild embedded_data empty_bpobj enabled_txg extensible_dataset filesystem_limits hole_birth large_blocks livelist log_spacemap lz4_compress project_quota resilver_defer spacemap_histogram spacemap_v2 userobj_accounting zilsaxattr zpool_checkpoint .No example# Nm cat Pa /usr/share/zfs/compatibility.d/grub2-2.06 # Features which are supported by GRUB2 versions prior to v2.12. # # GRUB is not able to detect ZFS pool if snapshot of top level boot pool # is created. This issue is observed with GRUB versions before v2.12 if # extensible_dataset feature is enabled on ZFS boot pool. # # This file lists all read-only compatible features except # extensible_dataset and any other feature that depends on it. # allocation_classes async_destroy block_cloning device_rebuild embedded_data empty_bpobj enabled_txg hole_birth log_spacemap lz4_compress resilver_defer spacemap_histogram spacemap_v2 zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev .Ed .Pp See .Xr zpool-create 8 and .Xr zpool-upgrade 8 for more information on how these commands are affected by feature sets. . .de feature .It Sy \\$2 .Bl -tag -compact -width "READ-ONLY COMPATIBLE" .It GUID .Sy \\$1:\\$2 .if !"\\$4"" \{\ .It DEPENDENCIES \fB\\$4\fP\c .if !"\\$5"" , \fB\\$5\fP\c .if !"\\$6"" , \fB\\$6\fP\c .if !"\\$7"" , \fB\\$7\fP\c .if !"\\$8"" , \fB\\$8\fP\c .if !"\\$9"" , \fB\\$9\fP\c .\} .It READ-ONLY COMPATIBLE \\$3 .El .Pp .. . .ds instant-never \ .No This feature becomes Sy active No as soon as it is enabled \ and will never return to being Sy enabled . . .ds remount-upgrade \ .No Each filesystem will be upgraded automatically when remounted, \ or when a new file is created under that filesystem. \ The upgrade can also be triggered on filesystems via \ Nm zfs Cm set Sy version Ns = Ns Sy current Ar fs . \ No The upgrade process runs in the background and may take a while to complete \ for filesystems containing large amounts of files . . .de checksum-spiel When the .Sy \\$1 feature is set to .Sy enabled , the administrator can turn on the .Sy \\$1 checksum on any dataset using .Nm zfs Cm set Sy checksum Ns = Ns Sy \\$1 Ar dset .Po see Xr zfs-set 8 Pc . This feature becomes .Sy active once a .Sy checksum property has been set to .Sy \\$1 , and will return to being .Sy enabled once all filesystems that have ever had their checksum set to .Sy \\$1 are destroyed. .. . .Sh FEATURES The following features are supported on this system: .Bl -tag -width Ds .feature org.zfsonlinux allocation_classes yes This feature enables support for separate allocation classes. 
.Pp This feature becomes .Sy active when a dedicated allocation class vdev .Pq dedup or special is created with the .Nm zpool Cm create No or Nm zpool Cm add No commands . With device removal, it can be returned to the .Sy enabled state if all the dedicated allocation class vdevs are removed. . .feature com.delphix async_destroy yes Destroying a file system requires traversing all of its data in order to return its used space to the pool. Without .Sy async_destroy , the file system is not fully removed until all space has been reclaimed. If the destroy operation is interrupted by a reboot or power outage, the next attempt to open the pool will need to complete the destroy operation synchronously. .Pp When .Sy async_destroy is enabled, the file system's data will be reclaimed by a background process, allowing the destroy operation to complete without traversing the entire file system. The background process is able to resume interrupted destroys after the pool has been opened, eliminating the need to finish interrupted destroys as part of the open operation. The amount of space remaining to be reclaimed by the background process is available through the .Sy freeing property. .Pp This feature is only .Sy active while .Sy freeing is non-zero. . .feature org.openzfs blake3 no extensible_dataset This feature enables the use of the BLAKE3 hash algorithm for checksum and dedup. BLAKE3 is a secure hash algorithm focused on high performance. .Pp .checksum-spiel blake3 . .feature com.fudosecurity block_cloning yes When this feature is enabled ZFS will use block cloning for operations like .Fn copy_file_range 2 . Block cloning allows to create multiple references to a single block. It is much faster than copying the data (as the actual data is neither read nor written) and takes no additional space. Blocks can be cloned across datasets under some conditions (like equal .Nm recordsize , the same master encryption key, etc.). ZFS tries its best to clone across datasets including encrypted ones. This is limited for various (nontrivial) reasons depending on the OS and/or ZFS internals. .Pp This feature becomes .Sy active when first block is cloned. When the last cloned block is freed, it goes back to the enabled state. .feature com.truenas block_cloning_endian yes This feature corrects ZAP entry endianness issues in the Block Reference Table (BRT) used by block cloning. During the original block cloning implementation, BRT ZAP entries were mistakenly stored as arrays of 8 single-byte entries instead of single 8-byte entries, making pools non-endian-safe. .Pp This feature is activated when the first BRT ZAP is created (that way ensuring compatibility with existing pools). When active, new BRT entries are stored in the correct endian-safe format. The feature becomes inactive when all BRT ZAPs are destroyed. .feature com.delphix bookmarks yes extensible_dataset This feature enables use of the .Nm zfs Cm bookmark command. .Pp This feature is .Sy active while any bookmarks exist in the pool. All bookmarks in the pool can be listed by running .Nm zfs Cm list Fl t Sy bookmark Fl r Ar poolname . . .feature com.datto bookmark_v2 no bookmark extensible_dataset This feature enables the creation and management of larger bookmarks which are needed for other features in ZFS. .Pp This feature becomes .Sy active when a v2 bookmark is created and will be returned to the .Sy enabled state when all v2 bookmarks are destroyed. . 
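.Pp
As an illustrative sketch
.Pq dataset and bookmark names are hypothetical ,
a bookmark can be created from a snapshot so that it can later serve as the source of an incremental
.Nm zfs Cm send
even after the snapshot itself is destroyed; whether it is stored as a v2 bookmark depends on the pool's feature state:
.Bd -literal -compact -offset 4n
.No # Nm zfs Cm snapshot Ar tank/data@monday
.No # Nm zfs Cm bookmark Ar tank/data@monday Ar tank/data#monday
.No # Nm zfs Cm destroy Ar tank/data@monday
.No # Nm zfs Cm list Fl t Sy bookmark Fl r Ar tank
.Ed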
.feature com.delphix bookmark_written no bookmark extensible_dataset bookmark_v2 This feature enables additional bookmark accounting fields, enabling the .Sy written Ns # Ns Ar bookmark property .Pq space written since a bookmark and estimates of send stream sizes for incrementals from bookmarks. .Pp This feature becomes .Sy active when a bookmark is created and will be returned to the .Sy enabled state when all bookmarks with these fields are destroyed. . .feature org.openzfs device_rebuild yes This feature enables the ability for the .Nm zpool Cm attach and .Nm zpool Cm replace commands to perform sequential reconstruction .Pq instead of healing reconstruction when resilvering. .Pp Sequential reconstruction resilvers a device in LBA order without immediately verifying the checksums. Once complete, a scrub is started, which then verifies the checksums. This approach allows full redundancy to be restored to the pool in the minimum amount of time. This two-phase approach will take longer than a healing resilver when the time to verify the checksums is included. However, unless there is additional pool damage, no checksum errors should be reported by the scrub. This feature is incompatible with raidz configurations. . This feature becomes .Sy active while a sequential resilver is in progress, and returns to .Sy enabled when the resilver completes. . .feature com.delphix device_removal no This feature enables the .Nm zpool Cm remove command to remove top-level vdevs, evacuating them to reduce the total size of the pool. .Pp This feature becomes .Sy active when the .Nm zpool Cm remove command is used on a top-level vdev, and will never return to being .Sy enabled . . .feature org.openzfs draid no This feature enables use of the .Sy draid vdev type. dRAID is a variant of RAID-Z which provides integrated distributed hot spares that allow faster resilvering while retaining the benefits of RAID-Z. Data, parity, and spare space are organized in redundancy groups and distributed evenly over all of the devices. .Pp This feature becomes .Sy active when creating a pool which uses the .Sy draid vdev type, or when adding a new .Sy draid vdev to an existing pool. . .feature com.klarasystems dynamic_gang_header no This feature enables larger gang headers based on the sector size of the pool. When enabled, gang headers will use the entire space allocated for them, instead of always restricting themselves to 512 bytes. This can reduce the need for nested gang trees in extreme fragmentation scenarios. .Pp This feature becomes active when a gang header is written that is larger than 512 bytes. This feature is not enabled by .Xr zpool-upgrade 8 . Instead, it must be manually enabled, or be part of a compatibility file. . .feature org.illumos edonr no extensible_dataset This feature enables the use of the Edon-R hash algorithm for checksum, including for nopwrite .Po if compression is also enabled, an overwrite of a block whose checksum matches the data being written will be ignored .Pc . In an abundance of caution, Edon-R requires verification when used with dedup: .Nm zfs Cm set Sy dedup Ns = Ns Sy edonr , Ns Sy verify .Po see Xr zfs-set 8 Pc . .Pp Edon-R is a very high-performance hash algorithm that was part of the NIST SHA-3 competition. It provides extremely high hash performance .Pq over 350% faster than SHA-256 , but was not selected because of its unsuitability as a general purpose secure hash algorithm. 
This implementation utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key .Pq stored on the pool before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. .Pp .checksum-spiel edonr . .feature com.delphix embedded_data no This feature improves the performance and compression ratio of highly-compressible blocks. Blocks whose contents can compress to 112 bytes or smaller can take advantage of this feature. .Pp When this feature is enabled, the contents of highly-compressible blocks are stored in the block .Dq pointer itself .Po a misnomer in this case, as it contains the compressed data, rather than a pointer to its location on disk .Pc . Thus the space of the block .Pq one sector, typically 512 B or 4 KiB is saved, and no additional I/O is needed to read and write the data block. . \*[instant-never] . .feature com.delphix empty_bpobj yes This feature increases the performance of creating and using a large number of snapshots of a single filesystem or volume, and also reduces the disk space required. .Pp When there are many snapshots, each snapshot uses many Block Pointer Objects .Pq bpobjs to track blocks associated with that snapshot. However, in common use cases, most of these bpobjs are empty. This feature allows us to create each bpobj on-demand, thus eliminating the empty bpobjs. .Pp This feature is .Sy active while there are any filesystems, volumes, or snapshots which were created after enabling this feature. . .feature com.delphix enabled_txg yes Once this feature is enabled, ZFS records the transaction group number in which new features are enabled. This has no user-visible impact, but other features may depend on this feature. .Pp This feature becomes .Sy active as soon as it is enabled and will never return to being .Sy enabled . . .feature com.datto encryption no bookmark_v2 extensible_dataset This feature enables the creation and management of natively encrypted datasets. .Pp This feature becomes .Sy active when an encrypted dataset is created and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . .feature com.klarasystems fast_dedup yes This feature allows more advanced deduplication features to be enabled on new dedup tables. .Pp This feature will be .Sy active when the first deduplicated block is written after a new dedup table is created (i.e. after a new pool creation, or new checksum used on a dataset with .Sy dedup enabled). It will be returned to the .Sy enabled state when all deduplicated blocks using it are freed. . .feature com.delphix extensible_dataset no This feature allows more flexible use of internal ZFS data structures, and exists for other features to depend on. .Pp This feature will be .Sy active when the first dependent feature uses it, and will be returned to the .Sy enabled state when all datasets that use this feature are destroyed. . .feature com.joyent filesystem_limits yes extensible_dataset This feature enables filesystem and snapshot limits. These limits can be used to control how many filesystems and/or snapshots can be created at the point in the tree on which the limits are set. .Pp This feature is .Sy active once either of the limit properties has been set on a dataset and will never return to being .Sy enabled . . 
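.Pp
For example, the limits might be applied to a hypothetical dataset
.Ar tank/projects
via the
.Sy filesystem_limit
and
.Sy snapshot_limit
properties
.Pq see Xr zfsprops 7 ,
capping how many descendant file systems and snapshots can be created beneath it:
.Bd -literal -compact -offset 4n
.No # Nm zfs Cm set Sy filesystem_limit Ns = Ns Ar 100 Ar tank/projects
.No # Nm zfs Cm set Sy snapshot_limit Ns = Ns Ar 500 Ar tank/projects
.Ed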
.feature com.delphix head_errlog no This feature enables the upgraded version of errlog, which required an on-disk error log format change. Now the error log of each head dataset is stored separately in the zap object and keyed by the head id. With this feature enabled, every dataset affected by an error block is listed in the output of .Nm zpool Cm status . In case of encrypted filesystems with unloaded keys we are unable to check their snapshots or clones for errors and these will not be reported. An "access denied" error will be reported. .Pp \*[instant-never] . .feature com.delphix hole_birth no enabled_txg This feature has/had bugs, the result of which is that, if you do a .Nm zfs Cm send Fl i .Pq or Fl R , No since it uses Fl i from an affected dataset, the receiving party will not see any checksum or other errors, but the resulting destination snapshot will not match the source. Its use by .Nm zfs Cm send Fl i has been disabled by default .Po see .Sy send_holes_without_birth_time in .Xr zfs 4 .Pc . .Pp This feature improves performance of incremental sends .Pq Nm zfs Cm send Fl i and receives for objects with many holes. The most common case of hole-filled objects is zvols. .Pp An incremental send stream from snapshot .Sy A No to snapshot Sy B contains information about every block that changed between .Sy A No and Sy B . Blocks which did not change between those snapshots can be identified and omitted from the stream using a piece of metadata called the .Dq block birth time , but birth times are not recorded for holes .Pq blocks filled only with zeroes . Since holes created after .Sy A No cannot be distinguished from holes created before Sy A , information about every hole in the entire filesystem or zvol is included in the send stream. .Pp For workloads where holes are rare this is not a problem. However, when incrementally replicating filesystems or zvols with many holes .Pq for example a zvol formatted with another filesystem a lot of time will be spent sending and receiving unnecessary information about holes that already exist on the receiving side. .Pp Once the .Sy hole_birth feature has been enabled the block birth times of all new holes will be recorded. Incremental sends between snapshots created after this feature is enabled will use this new metadata to avoid sending information about holes that already exist on the receiving side. .Pp \*[instant-never] . .feature org.open-zfs large_blocks no extensible_dataset This feature allows the record size on a dataset to be set larger than 128 KiB. .Pp This feature becomes .Sy active once a dataset contains a file with a block size larger than 128 KiB, and will return to being .Sy enabled once all filesystems that have ever had their recordsize larger than 128 KiB are destroyed. . .feature org.zfsonlinux large_dnode no extensible_dataset This feature allows the size of dnodes in a dataset to be set larger than 512 B. . This feature becomes .Sy active once a dataset contains an object with a dnode larger than 512 B, which occurs as a result of setting the .Sy dnodesize dataset property to a value other than .Sy legacy . The feature will return to being .Sy enabled once all filesystems that have ever contained a dnode larger than 512 B are destroyed. Large dnodes allow more data to be stored in the bonus buffer, thus potentially improving performance by avoiding the use of spill blocks. . 
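.Pp
As a sketch, large dnodes are typically requested by setting the
.Sy dnodesize
dataset property
.Pq the dataset name Ar tank/fs No is hypothetical :
.Dl # Nm zfs Cm set Sy dnodesize Ns = Ns Sy auto Ar tank/fs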
.feature com.klarasystems large_microzap yes extensible_dataset large_blocks This feature allows "micro" ZAPs to grow larger than 128 KiB without being upgraded to "fat" ZAPs. .Pp This feature becomes .Sy active the first time a micro ZAP grows larger than 128 KiB. It will only be returned to the .Sy enabled state when all datasets that ever had a large micro ZAP are destroyed. .Pp Note that even when this feature is enabled, micro ZAPs cannot grow larger than 128 KiB without also changing the .Sy zap_micro_max_size module parameter. See .Xr zfs 4 . . .feature com.delphix livelist yes extensible_dataset This feature allows clones to be deleted faster than the traditional method when a large number of random/sparse writes have been made to the clone. All blocks allocated and freed after a clone is created are tracked by the clone's livelist, which is referenced during the deletion of the clone. The feature is activated when a clone is created and remains .Sy active until all clones have been destroyed. . .feature com.delphix log_spacemap yes com.delphix:spacemap_v2 This feature improves performance for heavily-fragmented pools, especially when workloads are heavy in random-writes. It does so by logging all the metaslab changes on a single spacemap every TXG instead of scattering multiple writes to all the metaslab spacemaps. .Pp \*[instant-never] . .feature org.zfsonlinux longname no extensible_dataset This feature allows creating files and directories with names up to 1023 bytes in length. A new dataset property .Sy longname is also introduced to toggle longname support for each dataset individually. This property can be disabled even if the dataset contains longname files. In that case, new files cannot be created with long names, but existing longname files can still be looked up. .Pp This feature becomes .Sy active when a file name longer than 255 bytes is created in a dataset, and returns to being .Sy enabled when all such datasets are destroyed. . .feature org.illumos lz4_compress no .Sy lz4 is a high-performance real-time compression algorithm that features significantly faster compression and decompression as well as a higher compression ratio than the older .Sy lzjb compression. Typically, .Sy lz4 compression is approximately 50% faster on compressible data and 200% faster on incompressible data than .Sy lzjb . It is also approximately 80% faster on decompression, while giving approximately a 10% better compression ratio. .Pp When the .Sy lz4_compress feature is set to .Sy enabled , the administrator can turn on .Sy lz4 compression on any dataset on the pool using the .Xr zfs-set 8 command. All newly written metadata will be compressed with the .Sy lz4 algorithm. .Pp \*[instant-never] . .feature com.joyent multi_vdev_crash_dump no This feature allows a dump device to be configured with a pool comprised of multiple vdevs. Those vdevs may be arranged in any mirrored or raidz configuration. .Pp When the .Sy multi_vdev_crash_dump feature is set to .Sy enabled , the administrator can use .Xr dumpadm 8 to configure a dump device on a pool comprised of multiple vdevs. .Pp Under .Fx and Linux this feature is unused, but registered for compatibility. New pools created on these systems will have the feature .Sy enabled but will never transition to .Sy active , as this functionality is not required for crash dump support. Existing pools where this feature is .Sy active can be imported. .
.feature com.delphix obsolete_counts yes device_removal This feature is an enhancement of .Sy device_removal , which will over time reduce the memory used to track removed devices. When indirect blocks are freed or remapped, we note that their part of the indirect mapping is .Dq obsolete – no longer needed. .Pp This feature becomes .Sy active when the .Nm zpool Cm remove command is used on a top-level vdev, and will never return to being .Sy enabled . . .feature com.truenas physical_rewrite yes extensible_dataset This feature enables physical block rewriting that preserves logical birth times, avoiding unnecessary inclusion of rewritten blocks in incremental .Nm zfs Cm send streams. When enabled, the .Nm zfs Cm rewrite Fl P command can be used. .Pp This feature becomes .Sy active the first time .Nm zfs Cm rewrite Fl P is used on any dataset, and will return to being .Sy enabled once all datasets that have ever used physical rewrite are destroyed. . .feature org.zfsonlinux project_quota yes extensible_dataset This feature allows administrators to account space and object usage against the project identifier .Pq ID . .Pp The project ID is an object-based attribute. When upgrading an existing filesystem, objects without a project ID will be assigned a zero project ID. When this feature is enabled, newly created objects inherit their parent directories' project ID if the parent's inherit flag is set .Pq via Nm chattr Sy [+-]P No or Nm zfs Cm project Fl s Ns | Ns Fl C . Otherwise, the new object's project ID will be zero. An object's project ID can be changed at any time by the owner .Pq or privileged user via .Nm chattr Fl p Ar prjid or .Nm zfs Cm project Fl p Ar prjid . .Pp This feature will become .Sy active as soon as it is enabled and will never return to being .Sy disabled . \*[remount-upgrade] . .feature org.openzfs raidz_expansion no none This feature enables the .Nm zpool Cm attach subcommand to attach a new device to a RAID-Z group, expanding the total amount of usable space in the pool. See .Xr zpool-attach 8 . . .feature com.delphix redaction_bookmarks no bookmarks extensible_dataset This feature enables the use of redacted .Nm zfs Cm send Ns s , which create redaction bookmarks storing the list of blocks redacted by the send that created them. For more information about redacted sends, see .Xr zfs-send 8 . . .feature com.delphix redacted_datasets no extensible_dataset This feature enables the receiving of redacted .Nm zfs Cm send streams, which create redacted datasets when received. These datasets are missing some of their blocks, and so cannot be safely mounted, and their contents cannot be safely read. For more information about redacted receives, see .Xr zfs-send 8 . . .feature com.delphix redaction_list_spill no redaction_bookmarks This feature enables the redaction list created by zfs redact to store many more entries. It becomes .Sy active when a redaction list is created with more than 36 entries, and returns to being .Sy enabled when no long redaction lists remain in the pool. For more information about redacted sends, see .Xr zfs-send 8 . . .feature com.datto resilver_defer yes This feature allows ZFS to postpone new resilvers if an existing one is already in progress. Without this feature, any new resilvers will cause the currently running one to be immediately restarted from the beginning. .Pp This feature becomes .Sy active once a resilver has been deferred, and returns to being .Sy enabled when the deferred resilver begins. .
.feature org.illumos sha512 no extensible_dataset This feature enables the use of the SHA-512/256 truncated hash algorithm .Pq FIPS 180-4 for checksum and dedup. The native 64-bit arithmetic of SHA-512 provides an approximate 50% performance boost over SHA-256 on 64-bit hardware and is thus a good minimum-change replacement candidate for systems where hash performance is important, but these systems cannot for whatever reason utilize the faster .Sy skein No and Sy edonr algorithms. .Pp .checksum-spiel sha512 . .feature org.illumos skein no extensible_dataset This feature enables the use of the Skein hash algorithm for checksum and dedup. Skein is a high-performance secure hash algorithm that was a finalist in the NIST SHA-3 competition. It provides a very high security margin and high performance on 64-bit hardware .Pq 80% faster than SHA-256 . This implementation also utilizes the new salted checksumming functionality in ZFS, which means that the checksum is pre-seeded with a secret 256-bit random key .Pq stored on the pool before being fed the data block to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. .Pp .checksum-spiel skein . .feature com.delphix spacemap_histogram yes This feature allows ZFS to maintain more information about how free space is organized within the pool. If this feature is .Sy enabled , it will be activated when a new space map object is created, or an existing space map is upgraded to the new format, and never returns back to being .Sy enabled . . .feature com.delphix spacemap_v2 yes This feature enables the use of the new space map encoding which consists of two words .Pq instead of one whenever it is advantageous. The new encoding allows space maps to represent large regions of space more efficiently on-disk while also increasing their maximum addressable offset. .Pp This feature becomes .Sy active once it is .Sy enabled , and never returns back to being .Sy enabled . . .feature org.zfsonlinux userobj_accounting yes extensible_dataset This feature allows administrators to account for object usage information by user and group. .Pp \*[instant-never] \*[remount-upgrade] . .feature com.klarasystems vdev_zaps_v2 no This feature creates a ZAP object for the root vdev. .Pp This feature becomes active after the next .Nm zpool Cm import or .Nm zpool Cm reguid . . Properties can be retrieved or set on the root vdev using .Nm zpool Cm get and .Nm zpool Cm set with .Sy root as the vdev name which is an alias for .Sy root-0 . .feature org.openzfs zilsaxattr yes extensible_dataset This feature enables .Sy xattr Ns = Ns Sy sa extended attribute logging in the ZIL. If enabled, extended attribute changes .Pq both Sy xattr Ns = Ns Sy dir No and Sy xattr Ns = Ns Sy sa are guaranteed to be durable if either the dataset had .Sy sync Ns = Ns Sy always set at the time the changes were made, or .Xr sync 2 is called on the dataset after the changes were made. .Pp This feature becomes .Sy active when a ZIL is created for at least one dataset and will be returned to the .Sy enabled state when it is destroyed for all datasets that use this feature. . .feature com.delphix zpool_checkpoint yes This feature enables the .Nm zpool Cm checkpoint command that can checkpoint the state of the pool at the time it was issued and later rewind back to it or discard it. .Pp This feature becomes .Sy active when the .Nm zpool Cm checkpoint command is used to checkpoint the pool.
The feature will only return back to being .Sy enabled when the pool is rewound or the checkpoint has been discarded. . .feature org.freebsd zstd_compress no extensible_dataset .Sy zstd is a high-performance compression algorithm that features a combination of high compression ratios and high speed. Compared to .Sy gzip , .Sy zstd offers slightly better compression at much higher speeds. Compared to .Sy lz4 , .Sy zstd offers much better compression while being only modestly slower. Typically, .Sy zstd compression speed ranges from 250 to 500 MB/s per thread and decompression speed is over 1 GB/s per thread. .Pp When the .Sy zstd feature is set to .Sy enabled , the administrator can turn on .Sy zstd compression of any dataset using .Nm zfs Cm set Sy compress Ns = Ns Sy zstd Ar dset .Po see Xr zfs-set 8 Pc . This feature becomes .Sy active once a .Sy compress property has been set to .Sy zstd , and will return to being .Sy enabled once all filesystems that have ever had their .Sy compress property set to .Sy zstd are destroyed. .El . .Sh SEE ALSO .Xr zfs 8 , .Xr zpool 8 diff --git a/sys/contrib/openzfs/man/man7/zpoolconcepts.7 b/sys/contrib/openzfs/man/man7/zpoolconcepts.7 index dafe3bffc453..b9c8926d835d 100644 --- a/sys/contrib/openzfs/man/man7/zpoolconcepts.7 +++ b/sys/contrib/openzfs/man/man7/zpoolconcepts.7 @@ -1,509 +1,509 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd April 7, 2023 +.Dd August 6, 2025 .Dt ZPOOLCONCEPTS 7 .Os . .Sh NAME .Nm zpoolconcepts .Nd overview of ZFS storage pools . .Sh DESCRIPTION .Ss Virtual Devices (vdevs) A "virtual device" describes a single device or a collection of devices, organized according to certain performance and fault characteristics. The following virtual devices are supported: .Bl -tag -width "special" .It Sy disk A block device, typically located under .Pa /dev . ZFS can use individual slices or partitions, though the recommended mode of operation is to use whole disks. A disk can be specified by a full path, or it can be a shorthand name .Po the relative portion of the path under .Pa /dev .Pc . A whole disk can be specified by omitting the slice or partition designation. For example, .Pa sda is equivalent to .Pa /dev/sda . When given a whole disk, ZFS automatically labels the disk, if necessary. .It Sy file A regular file. 
The use of files as a backing store is strongly discouraged. It is designed primarily for experimental purposes, as the fault tolerance of a file is only as good as the file system on which it resides. A file must be specified by a full path. .It Sy mirror A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with .Em N No disks of size Em X No can hold Em X No bytes and can withstand Em N-1 devices failing, without losing data. .It Sy raidz , raidz1 , raidz2 , raidz3 A distributed-parity layout, similar to RAID-5/6, with improved distribution of parity, and which does not suffer from the RAID-5/6 .Qq write hole , .Pq in which data and parity become inconsistent after a power loss . Data and parity is striped across all disks within a raidz group, though not necessarily in a consistent stripe width. .Pp A raidz group can have single, double, or triple parity, meaning that the raidz group can sustain one, two, or three failures, respectively, without losing any data. The .Sy raidz1 vdev type specifies a single-parity raidz group; the .Sy raidz2 vdev type specifies a double-parity raidz group; and the .Sy raidz3 vdev type specifies a triple-parity raidz group. The .Sy raidz vdev type is an alias for .Sy raidz1 . .Pp A raidz group with .Em N No disks of size Em X No with Em P No parity disks can hold approximately .Em (N-P)*X No bytes and can withstand Em P No devices failing without losing data . The minimum number of devices in a raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. .It Sy draid , draid1 , draid2 , draid3 A variant of raidz that provides integrated distributed hot spares, allowing for faster resilvering, while retaining the benefits of raidz. A dRAID vdev is constructed from multiple internal raidz groups, each with .Em D No data devices and Em P No parity devices . These groups are distributed over all of the children in order to fully utilize the available disk performance. .Pp Unlike raidz, dRAID uses a fixed stripe width (padding as necessary with zeros) to allow fully sequential resilvering. This fixed stripe width significantly affects both usable capacity and IOPS. For example, with the default .Em D=8 No and Em 4 KiB No disk sectors the minimum allocation size is Em 32 KiB . If using compression, this relatively large allocation size can reduce the effective compression ratio. When using ZFS volumes (zvols) and dRAID, the default of the .Sy volblocksize property is increased to account for the allocation size. If a dRAID pool will hold a significant amount of small blocks, it is recommended to also add a mirrored .Sy special vdev to store those blocks. .Pp In regards to I/O, performance is similar to raidz since, for any read, all .Em D No data disks must be accessed . Delivered random IOPS can be reasonably approximated as .Sy floor((N-S)/(D+P))*single_drive_IOPS . .Pp Like raidz, a dRAID can have single-, double-, or triple-parity. The .Sy draid1 , .Sy draid2 , and .Sy draid3 types can be used to specify the parity level. The .Sy draid vdev type is an alias for .Sy draid1 . .Pp A dRAID with .Em N No disks of size Em X , D No data disks per redundancy group , Em P .No parity level, and Em S No distributed hot spares can hold approximately .Em (N-S)*(D/(D+P))*X No bytes and can withstand Em P devices failing without losing data. 
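.Pp
As a worked example of the formulas above, consider a hypothetical dRAID vdev built from
.Em N No = 24 disks of Em X No = 4 TiB each , with Em D No = 8 data disks and Em P No = 2 parity disks per redundancy group and Em S No = 1 distributed spare :
.Bd -literal -compact -offset 4n
capacity    ~ (N-S) * (D/(D+P)) * X = 23 * 0.8 * 4 TiB ~ 73.6 TiB
random IOPS ~ floor((N-S)/(D+P)) * single_drive_IOPS = floor(23/10) = 2 * single_drive_IOPS
.Ed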
.It Sy draid Ns Oo Ar parity Oc Ns Oo Sy \&: Ns Ar data Ns Sy d Oc Ns Oo Sy \&: Ns Ar children Ns Sy c Oc Ns Oo Sy \&: Ns Ar spares Ns Sy s Oc A non-default dRAID configuration can be specified by appending one or more of the following optional arguments to the .Sy draid keyword: .Bl -tag -compact -width "children" .It Ar parity The parity level (1-3). .It Ar data The number of data devices per redundancy group. In general, a smaller value of .Em D No will increase IOPS, improve the compression ratio , and speed up resilvering at the expense of total usable capacity. Defaults to .Em 8 , No unless Em N-P-S No is less than Em 8 . .It Ar children The expected number of children. Useful as a cross-check when listing a large number of devices. An error is returned when the provided number of children differs. .It Ar spares The number of distributed hot spares. Defaults to zero. .El .It Sy spare A pseudo-vdev which keeps track of available hot spares for a pool. For more information, see the .Sx Hot Spares section. .It Sy log A separate intent log device. If more than one log device is specified, then writes are load-balanced between devices. Log devices can be mirrored. However, raidz vdev types are not supported for the intent log. For more information, see the .Sx Intent Log section. .It Sy dedup A device solely dedicated for deduplication tables. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one dedup device is specified, then allocations are load-balanced between those devices. .It Sy special A device dedicated solely for allocating various kinds of internal metadata, and optionally small file blocks. The redundancy of this device should match the redundancy of the other normal devices in the pool. If more than one special device is specified, then allocations are load-balanced between those devices. .Pp For more information on special allocations, see the .Sx Special Allocation Class section. .It Sy cache A device used to cache storage pool data. A cache device cannot be configured as a mirror or raidz group. For more information, see the .Sx Cache Devices section. .El .Pp Virtual devices cannot be nested arbitrarily. A mirror, raidz or draid virtual device can only be created with files or disks. Mirrors of mirrors or other such combinations are not allowed. .Pp A pool can have any number of virtual devices at the top of the configuration .Po known as .Qq root vdevs .Pc . Data is dynamically distributed across all top-level devices to balance data among devices. As new virtual devices are added, ZFS automatically places data on the newly available devices. .Pp Virtual devices are specified one at a time on the command line, separated by whitespace. Keywords like .Sy mirror No and Sy raidz are used to distinguish where a group ends and another begins. For example, the following creates a pool with two root vdevs, each a mirror of two disks: .Dl # Nm zpool Cm create Ar mypool Sy mirror Ar sda sdb Sy mirror Ar sdc sdd . .Ss Device Failure and Recovery ZFS supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and ZFS automatically repairs bad data from a good copy, when corruption is detected. .Pp In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or raidz groups. 
While ZFS supports running in a non-redundant configuration, where each root vdev is simply a disk or file, this is strongly discouraged. A single case of bit corruption can render some or all of your data unavailable. .Pp A pool's health status is described by one of three states: .Sy online , degraded , No or Sy faulted . An online pool has all devices operating normally. A degraded pool is one in which one or more devices have failed, but the data is still available due to a redundant configuration. A faulted pool has corrupted metadata, or one or more faulted devices, and insufficient replicas to continue functioning. .Pp The health of the top-level vdev, such as a mirror or raidz device, is potentially impacted by the state of its associated vdevs or component devices. A top-level vdev or component device is in one of the following states: .Bl -tag -width "DEGRADED" .It Sy DEGRADED One or more top-level vdevs is in the degraded state because one or more component devices are offline. Sufficient replicas exist to continue functioning. .Pp One or more component devices is in the degraded or faulted state, but sufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It The number of checksum errors or slow I/Os exceeds acceptable levels and the device is degraded as an indication that something may be wrong. ZFS continues to use the device as necessary. .It The number of I/O errors exceeds acceptable levels. The device could not be marked as faulted because there are insufficient replicas to continue functioning. .El .It Sy FAULTED One or more top-level vdevs is in the faulted state because one or more component devices are offline. Insufficient replicas exist to continue functioning. .Pp One or more component devices is in the faulted state, and insufficient replicas exist to continue functioning. The underlying conditions are as follows: .Bl -bullet -compact .It The device could be opened, but the contents did not match expected values. .It The number of I/O errors exceeds acceptable levels and the device is faulted to prevent further use of the device. .El .It Sy OFFLINE The device was explicitly taken offline by the .Nm zpool Cm offline command. .It Sy ONLINE The device is online and functioning. .It Sy REMOVED The device was physically removed while the system was running. Device removal detection is hardware-dependent and may not be supported on all platforms. .It Sy UNAVAIL The device could not be opened. If a pool is imported when a device was unavailable, then the device will be identified by a unique identifier instead of its path since the path was never correct in the first place. .El .Pp Checksum errors represent events where a disk returned data that was expected to be correct, but was not. In other words, these are instances of silent data corruption. The checksum errors are reported in .Nm zpool Cm status and .Nm zpool Cm events . When a block is stored redundantly, a damaged block may be reconstructed (e.g. from raidz parity or a mirrored copy). In this case, ZFS reports the checksum error against the disks that contained damaged data. If a block is unable to be reconstructed (e.g. due to 3 disks being damaged in a raidz2 group), it is not possible to determine which disks were silently corrupted. In this case, checksum errors are reported for all disks on which the block is stored. .Pp If a device is removed and later re-attached to the system, ZFS attempts to bring the device online automatically. 
Device attachment detection is hardware-dependent and might not be supported on all platforms. . .Ss Hot Spares ZFS allows devices to be associated with pools as .Qq hot spares . These devices are not actively used in the pool. But, when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a .Sy spare vdev with any number of devices. For example, .Dl # Nm zpool Cm create Ar pool Sy mirror Ar sda sdb Sy spare Ar sdc sdd .Pp Spares can be shared across multiple pools, and can be added with the .Nm zpool Cm add command and removed with the .Nm zpool Cm remove command. Once a spare replacement is initiated, a new .Sy spare vdev is created within the configuration that will remain there until the original device is replaced. At this point, the hot spare becomes available again, if another device fails. .Pp If a pool has a shared spare that is currently being used, the pool cannot be exported, since other pools may use this shared spare, which may lead to potential data corruption. .Pp Shared spares add some risk. If the pools are imported on different hosts, and both pools suffer a device failure at the same time, both could attempt to use the spare at the same time. This may not be detected, resulting in data corruption. .Pp An in-progress spare replacement can be canceled by detaching the hot spare. If the original faulted device is detached, then the hot spare assumes its place in the configuration, and is removed from the spare list of all active pools. .Pp The .Sy draid vdev type provides distributed hot spares. These hot spares are named after the dRAID vdev they're a part of .Po Sy draid1 Ns - Ns Ar 2 Ns - Ns Ar 3 No specifies spare Ar 3 No of vdev Ar 2 , .No which is a single parity dRAID Pc and may only be used by that dRAID vdev. Otherwise, they behave the same as normal hot spares. .Pp Spares cannot replace log devices. . .Ss Intent Log The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous transactions. For instance, databases often require their transactions to be on stable storage devices when returning from a system call. NFS and other applications can also use .Xr fsync 2 to ensure data stability. By default, the intent log is allocated from blocks within the main pool. However, it might be possible to get better performance using separate intent log devices such as NVRAM or a dedicated disk. For example: .Dl # Nm zpool Cm create Ar pool sda sdb Sy log Ar sdc .Pp Multiple log devices can also be specified, and they can be mirrored. See the .Sx EXAMPLES section for an example of mirroring multiple log devices. . .Ss Cache Devices Devices can be added to a storage pool as .Qq cache devices . These devices provide an additional layer of caching between main memory and disk. For read-heavy workloads, where the working set size is much larger than what can be cached in main memory, using cache devices allows much more of this working set to be served from low latency media. Using cache devices provides the greatest performance improvement for random read-workloads of mostly static content. .Pp To create a pool with cache devices, specify a .Sy cache vdev with any number of devices. For example: .Dl # Nm zpool Cm create Ar pool sda sdb Sy cache Ar sdc sdd .Pp Cache devices cannot be mirrored or part of a raidz configuration. If a read error is encountered on a cache device, that read I/O is reissued to the original storage pool device, which might be part of a mirrored or raidz configuration. 
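.Pp
Cache devices can also be added to or removed from an existing pool
.Pq the device name below is illustrative :
.Bd -literal -compact -offset 4n
.No # Nm zpool Cm add Ar pool Sy cache Ar sde
.No # Nm zpool Cm remove Ar pool Ar sde
.Ed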
.Pp The content of the cache devices is persistent across reboots and restored asynchronously when importing the pool in L2ARC (persistent L2ARC). This can be disabled by setting .Sy l2arc_rebuild_enabled Ns = Ns Sy 0 . For cache devices smaller than .Em 1 GiB , ZFS does not write the metadata structures required for rebuilding the L2ARC, to conserve space. This can be changed with .Sy l2arc_rebuild_blocks_min_l2size . The cache device header .Pq Em 512 B is updated even if no metadata structures are written. Setting .Sy l2arc_headroom Ns = Ns Sy 0 will result in scanning the full-length ARC lists for cacheable content to be written in L2ARC (persistent ARC). If a cache device is added with .Nm zpool Cm add , its label and header will be overwritten and its contents will not be restored in L2ARC, even if the device was previously part of the pool. If a cache device is onlined with .Nm zpool Cm online , its contents will be restored in L2ARC. This is useful in case of memory pressure, where the contents of the cache device are not fully restored in L2ARC. The user can off- and online the cache device when there is less memory pressure, to fully restore its contents to L2ARC. . .Ss Pool checkpoint Before starting critical procedures that include destructive actions .Pq like Nm zfs Cm destroy , an administrator can checkpoint the pool's state and, in the case of a mistake or failure, rewind the entire pool back to the checkpoint. Otherwise, the checkpoint can be discarded when the procedure has completed successfully. .Pp A pool checkpoint can be thought of as a pool-wide snapshot and should be used with care as it contains every part of the pool's state, from properties to vdev configuration. Thus, certain operations are not allowed while a pool has a checkpoint. Specifically, vdev removal/attach/detach, mirror splitting, and changing the pool's GUID. Adding a new vdev is supported, but in the case of a rewind it will have to be added again. Finally, users of this feature should keep in mind that scrubs in a pool that has a checkpoint do not repair checkpointed data. .Pp To create a checkpoint for a pool: .Dl # Nm zpool Cm checkpoint Ar pool .Pp To later rewind to its checkpointed state, you need to first export it and then rewind it during import: .Dl # Nm zpool Cm export Ar pool .Dl # Nm zpool Cm import Fl -rewind-to-checkpoint Ar pool .Pp To discard the checkpoint from a pool: .Dl # Nm zpool Cm checkpoint Fl d Ar pool .Pp Dataset reservations (controlled by the .Sy reservation No and Sy refreservation properties) may be unenforceable while a checkpoint exists, because the checkpoint is allowed to consume the dataset's reservation. Finally, data that is part of the checkpoint but has been freed in the current state of the pool won't be scanned during a scrub. . .Ss Special Allocation Class Allocations in the special class are dedicated to specific block types. By default, this includes all metadata, the indirect blocks of user data, intent log (in absence of separate log device), and deduplication tables. The class can also be provisioned to accept small file blocks or zvol blocks on a per dataset granularity. .Pp A pool must always have at least one normal .Pq non- Ns Sy dedup Ns /- Ns Sy special vdev before other devices can be assigned to the special class. If the .Sy special class becomes full, then allocations intended for it will spill back into the normal class. 
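.Pp
As an illustrative sketch
.Pq pool, disk, and dataset names are hypothetical ,
a pool might be created with a mirrored special vdev, and a dataset then opted in to storing its small file blocks there:
.Bd -literal -compact -offset 4n
.No # Nm zpool Cm create Ar tank Sy raidz Ar sda sdb sdc Sy special mirror Ar nvme0n1 nvme1n1
.No # Nm zfs Cm set Sy special_small_blocks Ns = Ns Ar 32K Ar tank/fs
.Ed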
.Pp Deduplication tables can be excluded from the special class by unsetting the .Sy zfs_ddt_data_is_special ZFS module parameter. .Pp Inclusion of small file or zvol blocks in the special class is opt-in. Each dataset can control the size of small file blocks allowed in the special class by setting the .Sy special_small_blocks property to nonzero. See .Xr zfsprops 7 for more info on this property. diff --git a/sys/contrib/openzfs/man/man7/zpoolprops.7 b/sys/contrib/openzfs/man/man7/zpoolprops.7 index 5d84753193ee..d3b4c2376943 100644 --- a/sys/contrib/openzfs/man/man7/zpoolprops.7 +++ b/sys/contrib/openzfs/man/man7/zpoolprops.7 @@ -1,527 +1,527 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2021, Colm Buckley .\" Copyright (c) 2023, Klara Inc. .\" -.Dd November 18, 2024 +.Dd December 4, 2024 .Dt ZPOOLPROPS 7 .Os . .Sh NAME .Nm zpoolprops .Nd properties of ZFS storage pools . .Sh DESCRIPTION Each pool has several properties associated with it. Some properties are read-only statistics while others are configurable and change the behavior of the pool. .Pp User properties have no effect on ZFS behavior. Use them to annotate pools in a way that is meaningful in your environment. For more information about user properties, see the .Sx User Properties section. .Pp The following are read-only properties: .Bl -tag -width "unsupported@guid" .It Sy allocated Amount of storage used within the pool. See .Sy fragmentation and .Sy free for more information. .It Sy bcloneratio The ratio of the total amount of storage that would be required to store all the cloned blocks without cloning to the actual storage used. The .Sy bcloneratio property is calculated as: .Pp .Sy ( ( bclonesaved + bcloneused ) * 100 ) / bcloneused .It Sy bclonesaved The amount of additional storage that would be required if block cloning was not used. .It Sy bcloneused The amount of storage used by cloned blocks. .It Sy capacity Percentage of pool space used. This property can also be referred to by its shortened column name, .Sy cap . .It Sy dedupcached Total size of the deduplication table currently loaded into the ARC. See .Xr zpool-prefetch 8 . .It Sy dedup_table_size Total on-disk size of the deduplication table. 
.It Sy expandsize Amount of uninitialized space within the pool or device that can be used to increase the total capacity of the pool. On whole-disk vdevs, this is the space beyond the end of the GPT – typically occurring when a LUN is dynamically expanded or a disk replaced with a larger one. On partition vdevs, this is the space appended to the partition after it was added to the pool – most likely by resizing it in-place. The space can be claimed for the pool by bringing it online with .Sy autoexpand=on or using .Nm zpool Cm online Fl e . .It Sy fragmentation The amount of fragmentation in the pool. As the amount of space .Sy allocated increases, it becomes more difficult to locate .Sy free space. This may result in lower write performance compared to pools with more unfragmented free space. .It Sy free The amount of free space available in the pool. By contrast, the .Xr zfs 8 .Sy available property describes how much new data can be written to ZFS filesystems/volumes. The zpool .Sy free property is not generally useful for this purpose, and can be substantially more than the zfs .Sy available space. This discrepancy is due to several factors, including raidz parity; zfs reservation, quota, refreservation, and refquota properties; and space set aside by .Sy spa_slop_shift (see .Xr zfs 4 for more information). .It Sy freeing After a file system or snapshot is destroyed, the space it was using is returned to the pool asynchronously. .Sy freeing is the amount of space remaining to be reclaimed. Over time .Sy freeing will decrease while .Sy free increases. .It Sy guid A unique identifier for the pool. .It Sy health The current health of the pool. Health can be one of .Sy ONLINE , DEGRADED , FAULTED , OFFLINE, REMOVED , UNAVAIL . .It Sy last_scrubbed_txg Indicates the transaction group (TXG) up to which the most recent scrub operation has checked and repaired the dataset. This provides insight into the data integrity status of their pool at a specific point in time. .Xr zpool-scrub 8 can utilize this property to scan only data that has changed since the last scrub completed, when given the .Fl C flag. This property is not updated when performing an error scrub with the .Fl e flag. .It Sy leaked Space not released while .Sy freeing due to corruption, now permanently leaked into the pool. .It Sy load_guid A unique identifier for the pool. Unlike the .Sy guid property, this identifier is generated every time we load the pool (i.e. does not persist across imports/exports) and never changes while the pool is loaded (even if a .Sy reguid operation takes place). .It Sy size Total size of the storage pool. .It Sy unsupported@ Ns Em guid Information about unsupported features that are enabled on the pool. See .Xr zpool-features 7 for details. .El .Pp The space usage properties report actual physical space available to the storage pool. The physical space can be different from the total amount of space that any contained datasets can actually use. The amount of space used in a raidz configuration depends on the characteristics of the data being written. In addition, ZFS reserves some space for internal accounting that the .Xr zfs 8 command takes into account, but the .Nm command does not. For non-full pools of a reasonable size, these effects should be invisible. For small pools, or pools that are close to being completely full, these discrepancies may become more noticeable. 
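.Pp For example (the pool name .Ar tank is a placeholder), several of the read-only properties described above can be inspected with .Nm zpool Cm get : .Dl # Nm zpool Cm get Sy capacity , Ns Sy fragmentation , Ns Sy free , Ns Sy health Ar tank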
.Pp The following property can be set at creation time and import time: .Bl -tag -width Ds .It Sy altroot Alternate root directory. If set, this directory is prepended to any mount points within the pool. This can be used when examining an unknown pool where the mount points cannot be trusted, or in an alternate boot environment, where the typical paths are not valid. .Sy altroot is not a persistent property. It is valid only while the system is up. Setting .Sy altroot defaults to using .Sy cachefile Ns = Ns Sy none , though this may be overridden using an explicit setting. .El .Pp The following property can be set only at import time: .Bl -tag -width Ds .It Sy readonly Ns = Ns Sy on Ns | Ns Sy off If set to .Sy on , the pool will be imported in read-only mode. This property can also be referred to by its shortened column name, .Sy rdonly . .El .Pp The following properties can be set at creation time and import time, and later changed with the .Nm zpool Cm set command: .Bl -tag -width Ds .It Sy ashift Ns = Ns Ar ashift Pool sector size exponent, to the power of .Sy 2 (internally referred to as .Sy ashift ) . Values from 9 to 16, inclusive, are valid; also, the value 0 (the default) means to auto-detect using the kernel's block layer and a ZFS internal exception list. I/O operations will be aligned to the specified size boundaries. Additionally, the minimum (disk) write size will be set to the specified size, so this represents a space/performance trade-off. For optimal performance, the pool sector size should be greater than or equal to the sector size of the underlying disks. The typical case for setting this property is when performance is important and the underlying disks use 4KiB sectors but report 512B sectors to the OS (for compatibility reasons); in that case, set .Sy ashift Ns = Ns Sy 12 (which is .Sy 1<<12 No = Sy 4096 ) . When set, this property is used as the default hint value in subsequent vdev operations (add, attach and replace). Changing this value will not modify any existing vdev, not even on disk replacement; however it can be used, for instance, to replace a dying 512B sectors disk with a newer 4KiB sectors device: this will probably result in bad performance but at the same time could prevent loss of data. .It Sy autoexpand Ns = Ns Sy on Ns | Ns Sy off Controls automatic pool expansion when the underlying LUN is grown. If set to .Sy on , the pool will be resized according to the size of the expanded device. If the device is part of a mirror or raidz then all devices within that mirror/raidz group must be expanded before the new space is made available to the pool. The default behavior is .Sy off . This property can also be referred to by its shortened column name, .Sy expand . .It Sy autoreplace Ns = Ns Sy on Ns | Ns Sy off Controls automatic device replacement. If set to .Sy off , device replacement must be initiated by the administrator by using the .Nm zpool Cm replace command. If set to .Sy on , any new device, found in the same physical location as a device that previously belonged to the pool, is automatically formatted and replaced. The default behavior is .Sy off . This property can also be referred to by its shortened column name, .Sy replace . Autoreplace can also be used with virtual disks (like device mapper) provided that you use the /dev/disk/by-vdev paths setup by vdev_id.conf. See the .Xr vdev_id 8 manual page for more details. Autoreplace and autoonline require the ZFS Event Daemon be configured and running. 
See the .Xr zed 8 manual page for more details. .It Sy autotrim Ns = Ns Sy on Ns | Ns Sy off When set to .Sy on , space which has been recently freed, and is no longer allocated by the pool, will be periodically trimmed. This allows block device vdevs which support BLKDISCARD, such as SSDs, or file vdevs on which the underlying file system supports hole-punching, to reclaim unused blocks. The default value for this property is .Sy off . .Pp Automatic TRIM does not immediately reclaim blocks after a free. Instead, it will optimistically delay allowing smaller ranges to be aggregated into a few larger ones. These can then be issued more efficiently to the storage. TRIM on L2ARC devices is enabled by setting .Sy l2arc_trim_ahead > 0 . .Pp Be aware that automatic trimming of recently freed data blocks can put significant stress on the underlying storage devices. This will vary depending on how well the specific device handles these commands. For lower-end devices it is often possible to achieve most of the benefits of automatic trimming by running an on-demand (manual) TRIM periodically using the .Nm zpool Cm trim command. .It Sy bootfs Ns = Ns Sy (unset) Ns | Ns Ar pool Ns Op / Ns Ar dataset Identifies the default bootable dataset for the root pool. This property is expected to be set mainly by the installation and upgrade programs. Not all Linux distribution boot processes use the bootfs property. .It Sy cachefile Ns = Ns Ar path Ns | Ns Sy none Controls the location where the pool configuration is cached. Discovering all pools on system startup requires a cached copy of the configuration data that is stored on the root file system. All pools in this cache are automatically imported when the system boots. Some environments, such as install and clustering, need to cache this information in a different location so that pools are not automatically imported. Setting this property caches the pool configuration in a different location that can later be imported with .Nm zpool Cm import Fl c . Setting it to the value .Sy none creates a temporary pool that is never cached, and the .Qq .Pq empty string uses the default location. .Pp Multiple pools can share the same cache file. Because the kernel destroys and recreates this file when pools are added and removed, care should be taken when attempting to access this file. When the last pool using a .Sy cachefile is exported or destroyed, the file will be empty. .It Sy comment Ns = Ns Ar text A text string consisting of printable ASCII characters that will be stored such that it is available even if the pool becomes faulted. An administrator can provide additional information about a pool using this property. .It Sy compatibility Ns = Ns Sy off Ns | Ns Sy legacy Ns | Ns Ar file Ns Oo , Ns Ar file Oc Ns … Specifies that the pool maintain compatibility with specific feature sets. When set to .Sy off (or unset) compatibility is disabled (all features may be enabled); when set to .Sy legacy no features may be enabled. When set to a comma-separated list of filenames (each filename may either be an absolute path, or relative to .Pa /etc/zfs/compatibility.d or .Pa /usr/share/zfs/compatibility.d ) the lists of requested features are read from those files, separated by whitespace and/or commas. Only features present in all files may be enabled. .Pp See .Xr zpool-features 7 , .Xr zpool-create 8 and .Xr zpool-upgrade 8 for more information on the operation of compatibility feature sets.
.It Sy dedup_table_quota Ns = Ns Ar number Ns | Ns Sy none Ns | Ns Sy auto This property sets a limit on the on-disk size of the pool's dedup table. Entries will not be added to the dedup table once this size is reached; if a dedup table already exists, and is larger than this size, they will not be removed as part of setting this property. Existing entries will still have their reference counts updated. .Pp The actual size limit of the table may be above or below the quota, depending on the actual on-disk size of the entries (which may be approximated for purposes of calculating the quota). That is, setting a quota size of 1M may result in the maximum size being slightly below, or slightly above, that value. Set to .Sy 'none' to disable. In automatic mode, which is the default, the size of a dedicated dedup vdev is used as the quota limit. .Pp The .Sy dedup_table_quota property works for both legacy and fast dedup tables. .It Sy dedupditto Ns = Ns Ar number This property is deprecated and no longer has any effect. .It Sy delegation Ns = Ns Sy on Ns | Ns Sy off Controls whether a non-privileged user is granted access based on the dataset permissions defined on the dataset. See .Xr zfs 8 for more information on ZFS delegated administration. .It Sy failmode Ns = Ns Sy wait Ns | Ns Sy continue Ns | Ns Sy panic Controls the system behavior in the event of catastrophic pool failure. This condition is typically a result of a loss of connectivity to the underlying storage device(s) or a failure of all devices within the pool. The behavior of such an event is determined as follows: .Bl -tag -width "continue" .It Sy wait Blocks all I/O access until the device connectivity is recovered and the errors are cleared with .Nm zpool Cm clear . This is the default behavior. .It Sy continue Returns .Er EIO to any new write I/O requests but allows reads to any of the remaining healthy devices. Any write requests that have yet to be committed to disk would be blocked. .It Sy panic Prints out a message to the console and generates a system crash dump. .El .It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled The value of this property is the current state of .Ar feature_name . The only valid value when setting this property is .Sy enabled which moves .Ar feature_name to the enabled state. See .Xr zpool-features 7 for details on feature states. .It Sy listsnapshots Ns = Ns Sy on Ns | Ns Sy off Controls whether information about snapshots associated with this pool is output when .Nm zfs Cm list is run without the .Fl t option. The default value is .Sy off . This property can also be referred to by its shortened name, .Sy listsnaps . .It Sy multihost Ns = Ns Sy on Ns | Ns Sy off Controls whether a pool activity check should be performed during .Nm zpool Cm import . When a pool is determined to be active it cannot be imported, even with the .Fl f option. This property is intended to be used in failover configurations where multiple hosts have access to a pool on shared storage. .Pp Multihost provides protection on import only. It does not protect against an individual device being used in multiple pools, regardless of the type of vdev. See the discussion under .Nm zpool Cm create . .Pp When this property is on, periodic writes to storage occur to show the pool is in use. See .Sy zfs_multihost_interval in the .Xr zfs 4 manual page. In order to enable this property each host must set a unique hostid. See .Xr genhostid 1 .Xr zgenhostid 8 .Xr spl 4 for additional details. The default value is .Sy off . 
.It Sy version Ns = Ns Ar version The current on-disk version of the pool. This can be increased, but never decreased. The preferred method of updating pools is with the .Nm zpool Cm upgrade command, though this property can be used when a specific version is needed for backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. .El . .Ss User Properties In addition to the standard native properties, ZFS supports arbitrary user properties. User properties have no effect on ZFS behavior, but applications or administrators can use them to annotate pools. .Pp User property names must contain a colon .Pq Qq Sy \&: character to distinguish them from native properties. They may contain lowercase letters, numbers, and the following punctuation characters: colon .Pq Qq Sy \&: , dash .Pq Qq Sy - , period .Pq Qq Sy \&. , and underscore .Pq Qq Sy _ . The expected convention is that the property name is divided into two portions such as .Ar module : Ns Ar property , but this namespace is not enforced by ZFS. User property names can be at most 255 characters, and cannot begin with a dash .Pq Qq Sy - . .Pp When making programmatic use of user properties, it is strongly suggested to use a reversed DNS domain name for the .Ar module component of property names to reduce the chance that two independently-developed packages use the same property name for different purposes. .Pp The values of user properties are arbitrary strings and are never validated. All of the commands that operate on properties .Po Nm zpool Cm list , .Nm zpool Cm get , .Nm zpool Cm set , and so forth .Pc can be used to manipulate both native properties and user properties. Use .Nm zpool Cm set Ar name Ns = to clear a user property. Property values are limited to 8192 bytes. diff --git a/sys/contrib/openzfs/man/man8/zdb.8 b/sys/contrib/openzfs/man/man8/zdb.8 index 0a5b6af73fdb..e00544e4a5a4 100644 --- a/sys/contrib/openzfs/man/man8/zdb.8 +++ b/sys/contrib/openzfs/man/man8/zdb.8 @@ -1,590 +1,590 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" This file and its contents are supplied under the terms of the .\" Common Development and Distribution License ("CDDL"), version 1.0. .\" You may only use this file in accordance with the terms of version .\" 1.0 of the CDDL. .\" .\" A full copy of the text of the CDDL should have accompanied this .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. .\" .\" Copyright 2012, Richard Lowe. .\" Copyright (c) 2012, 2019 by Delphix. All rights reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd April 23, 2025 +.Dd August 12, 2025 .Dt ZDB 8 .Os . 
.Sh NAME .Nm zdb .Nd display ZFS storage pool debugging and consistency information .Sh SYNOPSIS .Nm .Op Fl AbcdDFGhikLMNPsTvXYy .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl I Ar inflight-I/O-ops .Oo Fl o Ar var Ns = Ns Ar value Oc Ns … .Op Fl t Ar txg .Op Fl U Ar cache .Op Fl x Ar dumpdir .Op Fl K Ar key .Op Ar poolname Ns Op / Ns Ar dataset Ns | Ns Ar objset-ID .Op Ar object Ns | Ns Ar range Ns … .Nm .Op Fl AdiPv .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache .Op Fl K Ar key .Ar poolname Ns Op Ar / Ns Ar dataset Ns | Ns Ar objset-ID .Op Ar object Ns | Ns Ar range Ns … .Nm .Fl B .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache .Op Fl K Ar key .Ar poolname Ns Ar / Ns Ar objset-ID .Op Ar backup-flags .Nm .Fl C .Op Fl A .Op Fl U Ar cache .Op Ar poolname .Nm .Fl E .Op Fl A .Ar word0 : Ns Ar word1 Ns :…: Ns Ar word15 .Nm .Fl l .Op Fl Aqu .Ar device .Nm .Fl m .Op Fl AFLPXY .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl t Ar txg .Op Fl U Ar cache .Ar poolname Op Ar vdev Oo Ar metaslab Oc Ns … .Nm .Fl O .Op Fl K Ar key .Ar dataset path .Nm .Fl r .Op Fl K Ar key .Ar dataset path destination .Nm .Fl R .Op Fl A .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache .Ar poolname vdev : Ns Ar offset : Ns Oo Ar lsize Ns / Oc Ns Ar psize Ns Op : Ns Ar flags .Nm .Fl S .Op Fl AP .Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … .Op Fl U Ar cache .Ar poolname . .Sh DESCRIPTION The .Nm utility displays information about a ZFS pool useful for debugging and performs some amount of consistency checking. It is not a general purpose tool and options .Pq and facilities may change. It is not a .Xr fsck 8 utility. .Pp The output of this command in general reflects the on-disk structure of a ZFS pool, and is inherently unstable. The precise output of most invocations is not documented; a knowledge of ZFS internals is assumed. .Pp If the .Ar dataset argument does not contain any .Qq Sy / or .Qq Sy @ characters, it is interpreted as a pool name. The root dataset can be specified as .Qq Ar pool Ns / . .Pp .Nm is an .Qq offline tool; it accesses the block devices underneath the pools directly from userspace and does not care if the pool is imported or datasets are mounted (or even if the system understands ZFS at all). When operating on an imported and active pool it is possible, though unlikely, that zdb may interpret inconsistent pool data and behave erratically. . .Sh OPTIONS Display options: .Bl -tag -width Ds .It Fl b , -block-stats Display statistics regarding the number, size .Pq logical, physical and allocated and deduplication of blocks. .It Fl B , -backup Generate a backup stream, similar to .Nm zfs Cm send , but for the numeric objset ID, and without opening the dataset. This can be useful in recovery scenarios if dataset metadata has become corrupted but the dataset itself is readable. The optional .Ar flags argument is a string of one or more of the letters .Sy e , .Sy L , .Sy c , and .Sy w , which correspond to the same flags in .Xr zfs-send 8 . .It Fl c , -checksum Verify the checksum of all metadata blocks while printing block statistics .Po see .Fl b .Pc . .Pp If specified multiple times, verify the checksums of all blocks. .It Fl C , -config Display information about the configuration. If specified with no other options, instead display information about the cache file .Pq Pa /etc/zfs/zpool.cache . To specify the cache file to display, see .Fl U .
.Pp If specified multiple times, and a pool name is also specified display both the cached configuration and the on-disk configuration. If specified multiple times with .Fl e also display the configuration that would be used were the pool to be imported. .It Fl d , -datasets Display information about datasets. Specified once, displays basic dataset information: ID, create transaction, size, and object count. See .Fl N for determining if .Ar poolname Ns Op / Ns Ar dataset Ns | Ns Ar objset-ID is to use the specified .Ar dataset Ns | Ns Ar objset-ID as a string (dataset name) or a number (objset ID) when datasets have numeric names. .Pp If specified multiple times provides greater and greater verbosity. .Pp If object IDs or object ID ranges are specified, display information about those specific objects or ranges only. .Pp An object ID range is specified in terms of a colon-separated tuple of the form .Ao start Ac : Ns Ao end Ac Ns Op : Ns Ao flags Ac . The fields .Ar start and .Ar end are integer object identifiers that denote the upper and lower bounds of the range. An .Ar end value of -1 specifies a range with no upper bound. The .Ar flags field optionally specifies a set of flags, described below, that control which object types are dumped. By default, all object types are dumped. A minus sign .Pq - negates the effect of the flag that follows it and has no effect unless preceded by the .Ar A flag. For example, the range 0:-1:A-d will dump all object types except for directories. .Pp .Bl -tag -compact -width Ds .It Sy A Dump all objects (this is the default) .It Sy d Dump ZFS directory objects .It Sy f Dump ZFS plain file objects .It Sy m Dump SPA space map objects .It Sy z Dump ZAP objects .It Sy - Negate the effect of next flag .El .It Fl D , -dedup-stats Display deduplication statistics, including the deduplication ratio .Pq Sy dedup , compression ratio .Pq Sy compress , inflation due to the zfs copies property .Pq Sy copies , and an overall effective ratio .Pq Sy dedup No \(mu Sy compress No / Sy copies . .It Fl DD Display a histogram of deduplication statistics, showing the allocated .Pq physically present on disk and referenced .Pq logically referenced in the pool block counts and sizes by reference count. .It Fl DDD Display the statistics independently for each deduplication table. .It Fl DDDD Dump the contents of the deduplication tables describing duplicate blocks. .It Fl DDDDD Also dump the contents of the deduplication tables describing unique blocks. .It Fl E , -embedded-block-pointer Ns = Ns Ar word0 : Ns Ar word1 Ns :…: Ns Ar word15 Decode and display block from an embedded block pointer specified by the .Ar word arguments. .It Fl h , -history Display pool history similar to .Nm zpool Cm history , but include internal changes, transaction, and dataset information. .It Fl i , -intent-logs Display information about intent log .Pq ZIL entries relating to each dataset. If specified multiple times, display counts of each intent log transaction type. .It Fl k , -checkpointed-state Examine the checkpointed state of the pool. Note, the on disk format of the pool is not reverted to the checkpointed state. .It Fl l , -label Ns = Ns Ar device Read the vdev labels and L2ARC header from the specified device. .Nm Fl l will return 0 if valid label was found, 1 if error occurred, and 2 if no valid labels were found. The presence of L2ARC header is indicated by a specific sequence (L2ARC_DEV_HDR_MAGIC). 
If there is an accounting error in the size or the number of L2ARC log blocks .Nm Fl l will return 1. Each unique configuration is displayed only once. .It Fl ll Ar device In addition display label space usage stats. If a valid L2ARC header was found also display the properties of log blocks used for restoring L2ARC contents (persistent L2ARC). .It Fl lll Ar device Display every configuration, unique or not. If a valid L2ARC header was found also display the properties of log entries in log blocks used for restoring L2ARC contents (persistent L2ARC). .Pp If the .Fl q option is also specified, don't print the labels or the L2ARC header. .Pp If the .Fl u option is also specified, also display the uberblocks on this device. Specify multiple times to increase verbosity. .It Fl L , -disable-leak-tracking Disable leak detection and the loading of space maps. By default, .Nm verifies that all non-free blocks are referenced, which can be very expensive. .It Fl m , -metaslabs Display the offset, spacemap, free space of each metaslab, all the log spacemaps and their obsolete entry statistics. .It Fl mm Also display information about the on-disk free space histogram associated with each metaslab. .It Fl mmm Display the maximum contiguous free space, the in-core free space histogram, and the percentage of free space in each space map. .It Fl mmmm Display every spacemap record. .It Fl M , -metaslab-groups Display all "normal" vdev metaslab group information - per-vdev metaslab count, fragmentation, and free space histogram, as well as overall pool fragmentation and histogram. .It Fl MM "Special" vdevs are added to -M's normal output. Also display information about the maximum contiguous free space and the percentage of free space in each space map. .It Fl MMM Display every spacemap record. .It Fl N Same as .Fl d but force zdb to interpret the .Op Ar dataset Ns | Ns Ar objset-ID in .Op Ar poolname Ns Op / Ns Ar dataset Ns | Ns Ar objset-ID as a numeric objset ID. .It Fl O , -object-lookups Ns = Ns Ar dataset path Look up the specified .Ar path inside of the .Ar dataset and display its metadata and indirect blocks. Specified .Ar path must be relative to the root of .Ar dataset . This option can be combined with .Fl v for increasing verbosity. .It Fl r , -copy-object Ns = Ns Ar dataset path destination Copy the specified .Ar path inside of the .Ar dataset to the specified destination. Specified .Ar path must be relative to the root of .Ar dataset . This option can be combined with .Fl v for increasing verbosity. .It Xo .Fl R , -read-block Ns = Ns Ar poolname vdev : Ns Ar offset : Ns Oo Ar lsize Ns / Oc Ns Ar psize Ns Op : Ns Ar flags .Xc Read and display a block from the specified device. By default the block is displayed as a hex dump, but see the description of the .Sy r flag, below. .Pp The block is specified in terms of a colon-separated tuple .Ar vdev .Pq an integer vdev identifier .Ar offset .Pq the offset within the vdev .Ar size .Pq the physical size, or logical size / physical size of the block to read and, optionally, .Ar flags .Pq a set of flags, described below . .Pp .Bl -tag -compact -width "b offset" .It Sy b Ar offset Print block pointer at hex offset .It Sy c Calculate and display checksums .It Sy d Decompress the block. Set environment variable .Nm ZDB_NO_ZLE to skip zle when guessing. 
.It Sy e Byte swap the block .It Sy g Dump gang block header .It Sy i Dump indirect block .It Sy r Dump raw uninterpreted block data .It Sy v Verbose output for guessing compression algorithm .El .It Fl s , -io-stats Report statistics on .Nm zdb I/O. Display operation counts, bandwidth, and error counts of I/O to the pool from .Nm . .It Fl S , -simulate-dedup Simulate the effects of deduplication, constructing a DDT and then display that DDT as with .Fl DD . .It Fl T , -brt-stats Display block reference table (BRT) statistics, including the size of unique blocks cloned, the space saving as a result of cloning, and the saving ratio. .It Fl TT Display the per-vdev BRT statistics, including total references. .It Fl TTT Display histograms of per-vdev BRT refcounts. .It Fl TTTT Dump the contents of the block reference tables. .It Fl u , -uberblock Display the current uberblock. .El .Pp Other options: .Bl -tag -width Ds .It Fl A , -ignore-assertions Do not abort should any assertion fail. .It Fl AA Enable panic recovery; certain errors which would otherwise be fatal are demoted to warnings. .It Fl AAA Do not abort if asserts fail and also enable panic recovery. .It Fl e , -exported Ns = Ns Oo Fl p Ar path Oc Ns … Operate on an exported pool, not present in .Pa /etc/zfs/zpool.cache . The .Fl p flag specifies the path under which devices are to be searched. .It Fl x , -dump-blocks Ns = Ns Ar dumpdir All blocks accessed will be copied to files in the specified directory. The blocks will be placed in sparse files whose name is the same as that of the file or device read. .Nm can then be run on the generated files. Note that the .Fl bbc flags are sufficient to access .Pq and thus copy all metadata on the pool. .It Fl F , -automatic-rewind Attempt to make an unreadable pool readable by trying progressively older transactions. .It Fl G , -dump-debug-msg Dump the contents of the zfs_dbgmsg buffer before exiting .Nm . zfs_dbgmsg is a buffer used by ZFS to dump advanced debug information. .It Fl I , -inflight Ns = Ns Ar inflight-I/O-ops Limit the number of outstanding checksum I/O operations to the specified value. The default value is 200. This option affects the performance of the .Fl c option. .It Fl K , -key Ns = Ns Ar key Decryption key needed to access an encrypted dataset. This will cause .Nm to attempt to unlock the dataset using the encryption root, key format and other encryption parameters on the given dataset. .Nm can still inspect pool and dataset structures on encrypted datasets without unlocking them, but will not be able to access file names and attributes and object contents. \fBWARNING:\fP The raw decryption key and any decrypted data will be in user memory while .Nm is running. Other user programs may be able to extract it by inspecting .Nm as it runs. Exercise extreme caution when using this option in shared or uncontrolled environments. .It Fl o , -option Ns = Ns Ar var Ns = Ns Ar value Ns … Set the given tunable to the provided value. .It Fl o , -option Ns = Ns Ar var Ns … Show the value of the given tunable. .It Fl o , -option Ns = Ns show Show all tunables and their values. .It Fl o , -option Ns = Ns info Ns = Ns Ar value Ns … Show info about a tunable, including its name, type and description. .It Fl o , -option Ns = Ns info Show info about all tunables. .It Fl P , -parseable Print numbers in an unscaled form more amenable to parsing, e.g.\& .Sy 1000000 rather than .Sy 1M .
.It Fl t , -txg Ns = Ns Ar transaction Specify the highest transaction to use when searching for uberblocks. See also the .Fl u and .Fl l options for a means to see the available uberblocks and their associated transaction numbers. .It Fl U , -cachefile Ns = Ns Ar cachefile Use a cache file other than .Pa /etc/zfs/zpool.cache . .It Fl v , -verbose Enable verbosity. Specify multiple times for increased verbosity. .It Fl V , -verbatim Attempt verbatim import. This mimics the behavior of the kernel when loading a pool from a cachefile. Only usable with .Fl e . .It Fl X , -extreme-rewind Attempt .Qq extreme transaction rewind, that is attempt the same recovery as .Fl F but read transactions otherwise deemed too old. .It Fl Y , -all-reconstruction Attempt all possible combinations when reconstructing indirect split blocks. This flag disables the individual I/O deadman timer in order to allow as much time as required for the attempted reconstruction. .It Fl y , -livelist Perform validation for livelists that are being deleted. Scans through the livelist and metaslabs, checking for duplicate entries and compares the two, checking for potential double frees. If it encounters issues, warnings will be printed, but the command will not necessarily fail. .El .Pp Specifying a display option more than once enables verbosity for only that option, with more occurrences enabling more verbosity. .Pp If no options are specified, all information about the named pool will be displayed at default verbosity. . .Sh EXIT STATUS The .Nm utility exits .Sy 0 on success, .Sy 1 if a fatal error occurs, .Sy 2 if invalid command line options were specified, or .Sy 3 if on-disk corruption was detected, but was not fatal. .Sh EXAMPLES .Ss Example 1 : No Display the configuration of imported pool Ar rpool .Bd -literal .No # Nm zdb Fl C Ar rpool MOS Configuration: version: 28 name: 'rpool' … .Ed . .Ss Example 2 : No Display basic dataset information about Ar rpool .Bd -literal .No # Nm zdb Fl d Ar rpool Dataset mos [META], ID 0, cr_txg 4, 26.9M, 1051 objects Dataset rpool/swap [ZVOL], ID 59, cr_txg 356, 486M, 2 objects … .Ed . .Ss Example 3 : No Display basic information about object 0 in Ar rpool/export/home .Bd -literal .No # Nm zdb Fl d Ar rpool/export/home 0 Dataset rpool/export/home [ZPL], ID 137, cr_txg 1546, 32K, 8 objects Object lvl iblk dblk dsize lsize %full type 0 7 16K 16K 15.0K 16K 25.00 DMU dnode .Ed . .Ss Example 4 : No Display the predicted effect of enabling deduplication on Ar rpool .Bd -literal .No # Nm zdb Fl S Ar rpool Simulated DDT histogram: bucket allocated referenced ______ ______________________________ ______________________________ refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE ------ ------ ----- ----- ----- ------ ----- ----- ----- 1 694K 27.1G 15.0G 15.0G 694K 27.1G 15.0G 15.0G 2 35.0K 1.33G 699M 699M 74.7K 2.79G 1.45G 1.45G … dedup = 1.11, compress = 1.80, copies = 1.00, dedup * compress / copies = 2.00 .Ed . .Sh SEE ALSO .Xr zfs 8 , .Xr zpool 8 diff --git a/sys/contrib/openzfs/man/man8/zed.8.in b/sys/contrib/openzfs/man/man8/zed.8.in index c90a1834403b..eda377aafc1e 100644 --- a/sys/contrib/openzfs/man/man8/zed.8.in +++ b/sys/contrib/openzfs/man/man8/zed.8.in @@ -1,274 +1,274 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" This file is part of the ZFS Event Daemon (ZED). .\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). .\" Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. 
.\" Refer to the OpenZFS git commit log for authoritative copyright attribution. .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License Version 1.0 (CDDL-1.0). .\" You can obtain a copy of the license from the top-level file .\" "OPENSOLARIS.LICENSE" or at . .\" You may not use this file except in compliance with the license. .\" .\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049) .\" -.Dd May 26, 2021 +.Dd August 22, 2022 .Dt ZED 8 .Os . .Sh NAME .Nm ZED .Nd ZFS Event Daemon .Sh SYNOPSIS .Nm .Op Fl fFhILMvVZ .Op Fl d Ar zedletdir .Op Fl p Ar pidfile .Op Fl P Ar path .Op Fl s Ar statefile .Op Fl j Ar jobs .Op Fl b Ar buflen . .Sh DESCRIPTION The .Nm (ZFS Event Daemon) monitors events generated by the ZFS kernel module. When a zevent (ZFS Event) is posted, the .Nm will run any ZEDLETs (ZFS Event Daemon Linkage for Executable Tasks) that have been enabled for the corresponding zevent class. . .Sh OPTIONS .Bl -tag -width "-h" .It Fl h Display a summary of the command-line options. .It Fl L Display license information. .It Fl V Display version information. .It Fl v Be verbose. .It Fl f Force the daemon to run if at all possible, disabling security checks and throwing caution to the wind. Not recommended for use in production. .It Fl F Don't daemonize: remain attached to the controlling terminal, log to the standard I/O streams. .It Fl M Lock all current and future pages in the virtual memory address space. This may help the daemon remain responsive when the system is under heavy memory pressure. .It Fl I Request that the daemon idle rather than exit when the kernel modules are not loaded. Processing of events will start, or resume, when the kernel modules are (re)loaded. Under Linux the kernel modules cannot be unloaded while the daemon is running. .It Fl Z Zero the daemon's state, thereby allowing zevents still within the kernel to be reprocessed. .It Fl d Ar zedletdir Read the enabled ZEDLETs from the specified directory. .It Fl p Ar pidfile Write the daemon's process ID to the specified file. .It Fl P Ar path Custom .Ev $PATH for zedlets to use. Normally zedlets run in a locked-down environment, with hardcoded paths to the ZFS commands .Pq Ev $ZFS , $ZPOOL , $ZED , … , and a hard-coded .Ev $PATH . This is done for security reasons. However, the ZFS test suite uses a custom PATH for its ZFS commands, and passes it to .Nm with .Fl P . In short, .Fl P is only to be used by the ZFS test suite; never use it in production! .It Fl s Ar statefile Write the daemon's state to the specified file. .It Fl j Ar jobs Allow at most .Ar jobs ZEDLETs to run concurrently, delaying execution of new ones until they finish. Defaults to .Sy 16 . .It Fl b Ar buflen Cap kernel event buffer growth to .Ar buflen entries. This buffer is grown when the daemon misses an event, but results in unreclaimable memory use in the kernel. A value of .Sy 0 removes the cap. Defaults to .Sy 1048576 . .El .Sh ZEVENTS A zevent is comprised of a list of nvpairs (name/value pairs). Each zevent contains an EID (Event IDentifier) that uniquely identifies it throughout the lifetime of the loaded ZFS kernel module; this EID is a monotonically increasing integer that resets to 1 each time the kernel module is loaded. Each zevent also contains a class string that identifies the type of event. For brevity, a subclass string is defined that omits the leading components of the class string. Additional nvpairs exist to provide event details. 
.Pp The kernel maintains a list of recent zevents that can be viewed (along with their associated lists of nvpairs) using the .Nm zpool Cm events Fl v command. . .Sh CONFIGURATION ZEDLETs to be invoked in response to zevents are located in the .Em enabled-zedlets directory .Pq Ar zedletdir . These can be symlinked or copied from the .Em installed-zedlets directory; symlinks allow for automatic updates from the installed ZEDLETs, whereas copies preserve local modifications. As a security measure, since ownership change is a privileged operation, ZEDLETs must be owned by root. They must have execute permissions for the user, but they must not have write permissions for group or other. Dotfiles are ignored. .Pp ZEDLETs are named after the zevent class for which they should be invoked. In particular, a ZEDLET will be invoked for a given zevent if either its class or subclass string is a prefix of its filename (and is followed by a non-alphabetic character). As a special case, the prefix .Sy all matches all zevents. Multiple ZEDLETs may be invoked for a given zevent. . .Sh ZEDLETS ZEDLETs are executables invoked by the ZED in response to a given zevent. They should be written under the presumption they can be invoked concurrently, and they should use appropriate locking to access any shared resources. Common variables used by ZEDLETs can be stored in the default rc file which is sourced by scripts; these variables should be prefixed with .Sy ZED_ . .Pp The zevent nvpairs are passed to ZEDLETs as environment variables. Each nvpair name is converted to an environment variable in the following manner: .Bl -enum -compact .It it is prefixed with .Sy ZEVENT_ , .It it is converted to uppercase, and .It each non-alphanumeric character is converted to an underscore. .El .Pp Some additional environment variables have been defined to present certain nvpair values in a more convenient form. An incomplete list of zevent environment variables is as follows: .Bl -tag -compact -width "ZEVENT_TIME_STRING" .It Sy ZEVENT_EID The Event IDentifier. .It Sy ZEVENT_CLASS The zevent class string. .It Sy ZEVENT_SUBCLASS The zevent subclass string. .It Sy ZEVENT_TIME The time at which the zevent was posted as .Dq Em seconds nanoseconds since the Epoch. .It Sy ZEVENT_TIME_SECS The .Em seconds component of .Sy ZEVENT_TIME . .It Sy ZEVENT_TIME_NSECS The .Em nanoseconds component of .Sy ZEVENT_TIME . .It Sy ZEVENT_TIME_STRING An almost-RFC3339-compliant string for .Sy ZEVENT_TIME . .El .Pp Additionally, the following ZED & ZFS variables are defined: .Bl -tag -compact -width "ZEVENT_TIME_STRING" .It Sy ZED_PID The daemon's process ID. .It Sy ZED_ZEDLET_DIR The daemon's current .Em enabled-zedlets directory. .It Sy ZFS_ALIAS The alias .Pq Dq Em name Ns - Ns Em version Ns - Ns Em release string of the ZFS distribution the daemon is part of. .It Sy ZFS_VERSION The ZFS version the daemon is part of. .It Sy ZFS_RELEASE The ZFS release the daemon is part of. .El .Pp ZEDLETs may need to call other ZFS commands. The installation paths of the following executables are defined as environment variables: .Sy ZDB , .Sy ZED , .Sy ZFS , .Sy ZINJECT , and .Sy ZPOOL . These variables may be overridden in the rc file. . .Sh FILES .Bl -tag -width "-c" .It Pa @sysconfdir@/zfs/zed.d The default directory for enabled ZEDLETs. .It Pa @sysconfdir@/zfs/zed.d/zed.rc The default rc file for common variables used by ZEDLETs. .It Pa @zfsexecdir@/zed.d The default directory for installed ZEDLETs. 
.It Pa @runstatedir@/zed.pid The default file containing the daemon's process ID. .It Pa @runstatedir@/zed.state The default file containing the daemon's state. .El . .Sh SIGNALS .Bl -tag -width "-c" .It Sy SIGHUP Reconfigure the daemon and rescan the directory for enabled ZEDLETs. .It Sy SIGTERM , SIGINT Terminate the daemon. .El . .Sh SEE ALSO .Xr zfs 8 , .Xr zpool 8 , .Xr zpool-events 8 . .Sh NOTES The .Nm requires root privileges. .Pp Do not taunt the .Nm . . .Sh BUGS ZEDLETs are unable to return state/status information to the kernel. .Pp Internationalization support via gettext has not been added. diff --git a/sys/contrib/openzfs/man/man8/zfs-allow.8 b/sys/contrib/openzfs/man/man8/zfs-allow.8 index 5a8e80bf6a43..b154aebd92aa 100644 --- a/sys/contrib/openzfs/man/man8/zfs-allow.8 +++ b/sys/contrib/openzfs/man/man8/zfs-allow.8 @@ -1,494 +1,494 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd March 13, 2025 .Dt ZFS-ALLOW 8 .Os . 
.Sh NAME .Nm zfs-allow .Nd delegate ZFS administration permissions to unprivileged users .Sh SYNOPSIS .Nm zfs .Cm allow .Op Fl dglu .Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm allow .Op Fl dl .Fl e Ns | Ns Sy everyone .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm allow .Fl c .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm allow .Fl s No @ Ns Ar setname .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm unallow .Op Fl dglru .Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … Oc .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm unallow .Op Fl dlr .Fl e Ns | Ns Sy everyone .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … Oc .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm unallow .Op Fl r .Fl c .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … Oc .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm unallow .Op Fl r .Fl s No @ Ns Ar setname .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … Oc .Ar filesystem Ns | Ns Ar volume . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm allow .Ar filesystem Ns | Ns Ar volume .Xc Displays permissions that have been delegated on the specified filesystem or volume. See the other forms of .Nm zfs Cm allow for more information. .Pp Delegations are supported under Linux with the exception of .Sy mount , .Sy unmount , .Sy mountpoint , .Sy canmount , .Sy rename , and .Sy share . These permissions cannot be delegated because the Linux .Xr mount 8 command restricts modifications of the global namespace to the root user. .It Xo .Nm zfs .Cm allow .Op Fl dglu .Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm zfs .Cm allow .Op Fl dl .Fl e Ns | Ns Sy everyone .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Ar filesystem Ns | Ns Ar volume .Xc Delegates ZFS administration permission for the file systems to non-privileged users. .Bl -tag -width "-d" .It Fl d Allow only for the descendent file systems. .It Fl e Ns | Ns Sy everyone Specifies that the permissions be delegated to everyone. .It Fl g Ar group Ns Oo , Ns Ar group Oc Ns … Explicitly specify that permissions are delegated to the group. .It Fl l Allow .Qq locally only for the specified file system. .It Fl u Ar user Ns Oo , Ns Ar user Oc Ns … Explicitly specify that permissions are delegated to the user. .It Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … Specifies to whom the permissions are delegated. Multiple entities can be specified as a comma-separated list. If neither of the .Fl gu options are specified, then the argument is interpreted preferentially as the keyword .Sy everyone , then as a user name, and lastly as a group name. To specify a user or group named .Qq everyone , use the .Fl g or .Fl u options. To specify a group with the same name as a user, use the .Fl g options. 
.It Xo .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Xc The permissions to delegate. Multiple permissions may be specified as a comma-separated list. Permission names are the same as ZFS subcommand and property names. See the property list below. Property set names, which begin with .Sy @ , may be specified. See the .Fl s form below for details. .El .Pp If neither of the .Fl dl options are specified, or both are, then the permissions are allowed for the file system or volume, and all of its descendants. .Pp Permissions are generally the ability to use a ZFS subcommand or change a ZFS property. The following permissions are available: .TS l l l . NAME TYPE NOTES _ _ _ allow subcommand Must also have the permission that is being allowed bookmark subcommand clone subcommand Must also have the \fBcreate\fR ability and \fBmount\fR ability in the origin file system create subcommand Must also have the \fBmount\fR ability. Must also have the \fBrefreservation\fR ability to create a non-sparse volume. destroy subcommand Must also have the \fBmount\fR ability diff subcommand Allows lookup of paths within a dataset given an object number, and the ability to create snapshots necessary to \fBzfs diff\fR. hold subcommand Allows adding a user hold to a snapshot load-key subcommand Allows loading and unloading of encryption key (see \fBzfs load-key\fR and \fBzfs unload-key\fR). change-key subcommand Allows changing an encryption key via \fBzfs change-key\fR. mount subcommand Allows mounting/unmounting ZFS datasets promote subcommand Must also have the \fBmount\fR and \fBpromote\fR ability in the origin file system receive subcommand Must also have the \fBmount\fR and \fBcreate\fR ability, required for \fBzfs receive -F\fR (see also \fBreceive:append\fR for limited, non forced receive) release subcommand Allows releasing a user hold which might destroy the snapshot rename subcommand Must also have the \fBmount\fR and \fBcreate\fR ability in the new parent rollback subcommand Must also have the \fBmount\fR ability send subcommand share subcommand Allows sharing file systems over NFS or SMB protocols snapshot subcommand Must also have the \fBmount\fR ability receive:append other Must also have the \fBmount\fR and \fBcreate\fR ability, limited receive ability (can not do receive -F) groupquota other Allows accessing any \fBgroupquota@\fI…\fR property groupobjquota other Allows accessing any \fBgroupobjquota@\fI…\fR property groupused other Allows reading any \fBgroupused@\fI…\fR property groupobjused other Allows reading any \fBgroupobjused@\fI…\fR property userprop other Allows changing any user property userquota other Allows accessing any \fBuserquota@\fI…\fR property userobjquota other Allows accessing any \fBuserobjquota@\fI…\fR property userused other Allows reading any \fBuserused@\fI…\fR property userobjused other Allows reading any \fBuserobjused@\fI…\fR property projectobjquota other Allows accessing any \fBprojectobjquota@\fI…\fR property projectquota other Allows accessing any \fBprojectquota@\fI…\fR property projectobjused other Allows reading any \fBprojectobjused@\fI…\fR property projectused other Allows reading any \fBprojectused@\fI…\fR property aclinherit property aclmode property acltype property atime property canmount property casesensitivity property checksum property compression property context property copies property dedup property defcontext property devices property dnodesize property encryption property exec property filesystem_limit 
property fscontext property keyformat property keylocation property logbias property mlslabel property mountpoint property nbmand property normalization property overlay property pbkdf2iters property primarycache property quota property readonly property recordsize property redundant_metadata property refquota property refreservation property relatime property reservation property rootcontext property secondarycache property setuid property sharenfs property sharesmb property snapdev property snapdir property snapshot_limit property special_small_blocks property sync property utf8only property version property volblocksize property volmode property volsize property vscan property xattr property zoned property .TE .It Xo .Nm zfs .Cm allow .Fl c .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Ar filesystem Ns | Ns Ar volume .Xc Sets .Qq create time permissions. These permissions are granted .Pq locally to the creator of any newly-created descendent file system. .It Xo .Nm zfs .Cm allow .Fl s No @ Ns Ar setname .Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … .Ar filesystem Ns | Ns Ar volume .Xc Defines or adds permissions to a permission set. The set can be used by other .Nm zfs Cm allow commands for the specified file system and its descendants. Sets are evaluated dynamically, so changes to a set are immediately reflected. Permission sets follow the same naming restrictions as ZFS file systems, but the name must begin with .Sy @ , and can be no more than 64 characters long. .It Xo .Nm zfs .Cm unallow .Op Fl dglru .Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns … .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … Oc .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm zfs .Cm unallow .Op Fl dlr .Fl e Ns | Ns Sy everyone .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … Oc .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm zfs .Cm unallow .Op Fl r .Fl c .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … Oc .Ar filesystem Ns | Ns Ar volume .Xc Removes permissions that were granted with the .Nm zfs Cm allow command. No permissions are explicitly denied, so other permissions granted are still in effect. For example, if the permission is granted by an ancestor. If no permissions are specified, then all permissions for the specified .Ar user , .Ar group , or .Sy everyone are removed. Specifying .Sy everyone .Po or using the .Fl e option .Pc only removes the permissions that were granted to everyone, not all permissions for every user and group. See the .Nm zfs Cm allow command for a description of the .Fl ldugec options. .Bl -tag -width "-r" .It Fl r Recursively remove the permissions from this file system and all descendants. .El .It Xo .Nm zfs .Cm unallow .Op Fl r .Fl s No @ Ns Ar setname .Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns .Ar setname Oc Ns … Oc .Ar filesystem Ns | Ns Ar volume .Xc Removes permissions from a permission set. If no permissions are specified, then all permissions are removed, thus removing the set entirely. .El . .Sh EXAMPLES .\" These are, respectively, examples 17, 18, 19, 20 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Delegating ZFS Administration Permissions on a ZFS Dataset The following example shows how to set permissions so that user .Ar cindys can create, destroy, mount, and take snapshots on .Ar tank/cindys . 
The permissions on .Ar tank/cindys are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Sy cindys create , Ns Sy destroy , Ns Sy mount , Ns Sy snapshot Ar tank/cindys .No # Nm zfs Cm allow Ar tank/cindys ---- Permissions on tank/cindys -------------------------------------- Local+Descendent permissions: user cindys create,destroy,mount,snapshot .Ed .Pp Because the .Ar tank/cindys mount point permission is set to 755 by default, user .Ar cindys will be unable to mount file systems under .Ar tank/cindys . Add an ACE similar to the following syntax to provide mount point access: .Dl # Cm chmod No A+user : Ns Ar cindys Ns :add_subdirectory:allow Ar /tank/cindys . .Ss Example 2 : No Delegating Create Time Permissions on a ZFS Dataset The following example shows how to grant anyone in the group .Ar staff permission to create file systems in .Ar tank/users . This syntax also allows staff members to destroy their own file systems, but not to destroy anyone else's file system. The permissions on .Ar tank/users are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Ar staff Sy create , Ns Sy mount Ar tank/users .No # Nm zfs Cm allow Fl c Sy destroy Ar tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: destroy Local+Descendent permissions: group staff create,mount .Ed . .Ss Example 3 : No Defining and Granting a Permission Set on a ZFS Dataset The following example shows how to define and grant a permission set on the .Ar tank/users file system. The permissions on .Ar tank/users are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Fl s No @ Ns Ar pset Sy create , Ns Sy destroy , Ns Sy snapshot , Ns Sy mount Ar tank/users .No # Nm zfs Cm allow staff No @ Ns Ar pset tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed . .Ss Example 4 : No Delegating Property Permissions on a ZFS Dataset The following example shows how to grant the ability to set quotas and reservations on the .Ar users/home file system. The permissions on .Ar users/home are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Ar cindys Sy quota , Ns Sy reservation Ar users/home .No # Nm zfs Cm allow Ar users/home ---- Permissions on users/home --------------------------------------- Local+Descendent permissions: user cindys quota,reservation cindys% zfs set quota=10G users/home/marks cindys% zfs get quota users/home/marks NAME PROPERTY VALUE SOURCE users/home/marks quota 10G local .Ed . .Ss Example 5 : No Removing ZFS Delegated Permissions on a ZFS Dataset The following example shows how to remove the snapshot permission from the .Ar staff group on the .Sy tank/users file system. The permissions on .Sy tank/users are also displayed.
.Bd -literal -compact -offset Ds .No # Nm zfs Cm unallow Ar staff Sy snapshot Ar tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed diff --git a/sys/contrib/openzfs/man/man8/zfs-bookmark.8 b/sys/contrib/openzfs/man/man8/zfs-bookmark.8 index 083ff46d241b..5a0933820020 100644 --- a/sys/contrib/openzfs/man/man8/zfs-bookmark.8 +++ b/sys/contrib/openzfs/man/man8/zfs-bookmark.8 @@ -1,76 +1,76 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" Copyright (c) 2019, 2020 by Christian Schwarz. All Rights Reserved. .\" -.Dd May 12, 2022 +.Dd July 11, 2022 .Dt ZFS-BOOKMARK 8 .Os . .Sh NAME .Nm zfs-bookmark .Nd create bookmark of ZFS snapshot .Sh SYNOPSIS .Nm zfs .Cm bookmark .Ar snapshot Ns | Ns Ar bookmark .Ar newbookmark . .Sh DESCRIPTION Creates a new bookmark of the given snapshot or bookmark. Bookmarks mark the point in time when the snapshot was created, and can be used as the incremental source for a .Nm zfs Cm send . .Pp When creating a bookmark from an existing redaction bookmark, the resulting bookmark is .Em not a redaction bookmark. .Pp This feature must be enabled to be used. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy bookmarks feature. . .Sh EXAMPLES .\" These are, respectively, examples 23 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Creating a bookmark The following example creates a bookmark to a snapshot. This bookmark can then be used instead of a snapshot in send streams. .Dl # Nm zfs Cm bookmark Ar rpool Ns @ Ns Ar snapshot rpool Ns # Ns Ar bookmark . 
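.Pp
.\" Added illustrative follow-up; the later snapshot "nextsnap" and the receiving dataset "backup/rpool" are assumed placeholder names.
As an illustrative sketch of the bookmark's purpose (the later snapshot
.Ar rpool Ns @ Ns Ar nextsnap
and the receiving dataset
.Ar backup/rpool
are placeholder names), the bookmark can later serve as the incremental source of a send stream:
.Dl # Nm zfs Cm send Fl i Ar rpool Ns # Ns Ar bookmark rpool Ns @ Ns Ar nextsnap | Nm zfs Cm receive Ar backup/rpool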
.Sh SEE ALSO .Xr zfs-destroy 8 , .Xr zfs-send 8 , .Xr zfs-snapshot 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-clone.8 b/sys/contrib/openzfs/man/man8/zfs-clone.8 index cd412815f5fe..9609cf2ce36a 100644 --- a/sys/contrib/openzfs/man/man8/zfs-clone.8 +++ b/sys/contrib/openzfs/man/man8/zfs-clone.8 @@ -1,97 +1,97 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-CLONE 8 .Os . .Sh NAME .Nm zfs-clone .Nd clone snapshot of ZFS dataset .Sh SYNOPSIS .Nm zfs .Cm clone .Op Fl p .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Ar snapshot Ar filesystem Ns | Ns Ar volume . .Sh DESCRIPTION See the .Sx Clones section of .Xr zfsconcepts 7 for details. The target dataset can be located anywhere in the ZFS hierarchy, and is created as the same type as the original. .Bl -tag -width Ds .It Fl o Ar property Ns = Ns Ar value Sets the specified property; see .Nm zfs Cm create for details. .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. If the target filesystem or volume already exists, the operation completes successfully. .El . .Sh EXAMPLES .\" These are, respectively, examples 9, 10 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Creating a ZFS Clone The following command creates a writable file system whose initial contents are the same as .Ar pool/home/bob@yesterday . .Dl # Nm zfs Cm clone Ar pool/home/bob@yesterday pool/clone . 
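.Pp
.\" Added illustrative variation; the clone name "pool/clones/bob" and the mountpoint "/clones/bob" are assumed placeholders.
As an illustrative variation (the clone name
.Ar pool/clones/bob
and the mountpoint
.Pa /clones/bob
are placeholders), the
.Fl p
and
.Fl o
options described above can be combined to create any missing parent datasets and set a property in the same step:
.Dl # Nm zfs Cm clone Fl p Fl o Sy mountpoint Ns = Ns Ar /clones/bob pool/home/bob@yesterday pool/clones/bob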
.Ss Example 2 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -compact -offset Ds .No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data .No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today .No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them .No # Nm zfs Cm promote Ar pool/project/beta .No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy .No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed .No # Nm zfs Cm destroy Ar pool/project/legacy .Ed . .Sh SEE ALSO .Xr zfs-promote 8 , .Xr zfs-snapshot 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-create.8 b/sys/contrib/openzfs/man/man8/zfs-create.8 index 91878056cc7d..58bde5799240 100644 --- a/sys/contrib/openzfs/man/man8/zfs-create.8 +++ b/sys/contrib/openzfs/man/man8/zfs-create.8 @@ -1,280 +1,280 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd June 2, 2023 .Dt ZFS-CREATE 8 .Os . .Sh NAME .Nm zfs-create .Nd create ZFS dataset .Sh SYNOPSIS .Nm zfs .Cm create .Op Fl Pnpuv .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Ar filesystem .Nm zfs .Cm create .Op Fl ps .Op Fl b Ar blocksize .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Fl V Ar size Ar volume . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm create .Op Fl Pnpuv .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Ar filesystem .Xc Creates a new ZFS file system. The file system is automatically mounted according to the .Sy mountpoint property inherited from the parent, unless the .Fl u option is used. .Bl -tag -width "-o" .It Fl o Ar property Ns = Ns Ar value Sets the specified property as if the command .Nm zfs Cm set Ar property Ns = Ns Ar value was invoked at the same time the dataset was created. Any editable ZFS property can also be set at creation time. Multiple .Fl o options can be specified. 
An error results if the same property is specified in multiple .Fl o options. .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. Any property specified on the command line using the .Fl o option is ignored. If the target filesystem already exists, the operation completes successfully. .It Fl n Do a dry-run .Pq Qq No-op creation. No datasets will be created. This is useful in conjunction with the .Fl v or .Fl P flags to validate properties that are passed via .Fl o options and those implied by other options. The actual dataset creation can still fail due to insufficient privileges or available capacity. .It Fl P Print machine-parsable verbose information about the created dataset. Each line of output contains a key and one or two values, all separated by tabs. The .Sy create_ancestors and .Sy create keys have .Em filesystem as their only value. The .Sy create_ancestors key only appears if the .Fl p option is used. The .Sy property key has two values, a property name and that property's value. The .Sy property key may appear zero or more times, once for each property that will be set local to .Em filesystem due to the use of the .Fl o option. .It Fl u Do not mount the newly created file system. .It Fl v Print verbose information about the created dataset. .El .It Xo .Nm zfs .Cm create .Op Fl ps .Op Fl b Ar blocksize .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Fl V Ar size Ar volume .Xc Creates a volume of the given size. The volume is exported as a block device in .Pa /dev/zvol/path , where .Em path is the name of the volume in the ZFS namespace. The size represents the logical size as exported by the device. By default, a reservation of equal size is created. .Pp .Ar size is automatically rounded up to the nearest multiple of the .Sy blocksize . .Bl -tag -width "-b" .It Fl b Ar blocksize Equivalent to .Fl o Sy volblocksize Ns = Ns Ar blocksize . If this option is specified in conjunction with .Fl o Sy volblocksize , the resulting behavior is undefined. .It Fl o Ar property Ns = Ns Ar value Sets the specified property as if the .Nm zfs Cm set Ar property Ns = Ns Ar value command was invoked at the same time the dataset was created. Any editable ZFS property can also be set at creation time. Multiple .Fl o options can be specified. An error results if the same property is specified in multiple .Fl o options. .It Fl p Creates all the non-existing parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. Any property specified on the command line using the .Fl o option is ignored. If the target filesystem already exists, the operation completes successfully. .It Fl s Creates a sparse volume with no reservation. See .Sy volsize in the .Em Native Properties section of .Xr zfsprops 7 for more information about sparse volumes. .It Fl n Do a dry-run .Pq Qq No-op creation. No datasets will be created. This is useful in conjunction with the .Fl v or .Fl P flags to validate properties that are passed via .Fl o options and those implied by other options. The actual dataset creation can still fail due to insufficient privileges or available capacity. .It Fl P Print machine-parsable verbose information about the created dataset. Each line of output contains a key and one or two values, all separated by tabs. The .Sy create_ancestors and .Sy create keys have .Em volume as their only value.
The .Sy create_ancestors key only appears if the .Fl p option is used. The .Sy property key has two values, a property name and that property's value. The .Sy property key may appear zero or more times, once for each property that will be set local to .Em volume due to the use of the .Fl b or .Fl o options, as well as .Sy refreservation if the volume is not sparse. .It Fl v Print verbose information about the created dataset. .El .El .Ss ZFS for Swap Swapping to a ZFS volume is prone to deadlock and not recommended. See the OpenZFS FAQ. .Pp Swapping to a file on a ZFS filesystem is not supported. . .Sh EXAMPLES .\" These are, respectively, examples 1, 10 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Creating a ZFS File System Hierarchy The following commands create a file system named .Ar pool/home and a file system named .Ar pool/home/bob . The mount point .Pa /export/home is set for the parent file system, and is automatically inherited by the child file system. .Dl # Nm zfs Cm create Ar pool/home .Dl # Nm zfs Cm set Sy mountpoint Ns = Ns Ar /export/home pool/home .Dl # Nm zfs Cm create Ar pool/home/bob . .Ss Example 2 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -compact -offset Ds .No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data .No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today .No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them .No # Nm zfs Cm promote Ar pool/project/beta .No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy .No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed .No # Nm zfs Cm destroy Ar pool/project/legacy .Ed . .Sh SEE ALSO .Xr zfs-destroy 8 , .Xr zfs-list 8 , .Xr zpool-create 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-destroy.8 b/sys/contrib/openzfs/man/man8/zfs-destroy.8 index 38359be02430..6a6791f7a44e 100644 --- a/sys/contrib/openzfs/man/man8/zfs-destroy.8 +++ b/sys/contrib/openzfs/man/man8/zfs-destroy.8 @@ -1,236 +1,236 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved.
.\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd February 5, 2025 .Dt ZFS-DESTROY 8 .Os . .Sh NAME .Nm zfs-destroy .Nd destroy ZFS dataset, snapshots, or bookmark .Sh SYNOPSIS .Nm zfs .Cm destroy .Op Fl Rfnprv .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm destroy .Op Fl Rdnprv .Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns .Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns … .Nm zfs .Cm destroy .Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm destroy .Op Fl Rfnprv .Ar filesystem Ns | Ns Ar volume .Xc Destroys the given dataset. By default, the command unshares any file systems that are currently shared, unmounts any file systems that are currently mounted, and refuses to destroy a dataset that has active dependents .Pq children or clones . .Bl -tag -width "-R" .It Fl R Recursively destroy all dependents, including cloned file systems outside the target hierarchy. .It Fl f Forcibly unmount file systems. This option has no effect on non-file systems or unmounted file systems. .It Fl n Do a dry-run .Pq Qq No-op deletion. No data will be deleted. This is useful in conjunction with the .Fl v or .Fl p flags to determine what data would be deleted. .It Fl p Print machine-parsable verbose information about the deleted data. .It Fl r Recursively destroy all children. .It Fl v Print verbose information about the deleted data. .El .Pp Extreme care should be taken when applying either the .Fl r or the .Fl R options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .It Xo .Nm zfs .Cm destroy .Op Fl Rdnprv .Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns .Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns … .Xc Attempts to destroy the given snapshot(s). This will fail if any clones of the snapshot exist or if the snapshot is held. In this case, by default, .Nm zfs Cm destroy will have no effect and exit in error. If the .Fl d option is applied, the command will instead mark the given snapshot for automatic destruction as soon as it becomes eligible. While marked for destruction, a snapshot remains visible, and the user may create new clones from it and place new holds on it. .Pp The read-only snapshot properties .Sy defer_destroy and .Sy userrefs are used by .Nm zfs Cm destroy to determine eligibility and marked status. .Pp An inclusive range of snapshots may be specified by separating the first and last snapshots with a percent sign. The first and/or last snapshots may be left blank, in which case the filesystem's oldest or newest snapshot will be implied. .Pp Multiple snapshots .Pq or ranges of snapshots of the same filesystem or volume may be specified in a comma-separated list of snapshots. Only the snapshot's short name .Po the part after the .Sy @ .Pc should be specified when using a range or comma-separated list to identify multiple snapshots. .Bl -tag -width "-R" .It Fl R Recursively destroy all clones of these snapshots, including the clones, snapshots, and children. If this flag is specified, the .Fl d flag will have no effect. .It Fl d Rather than returning error if the given snapshot is ineligible for immediate destruction, mark it for deferred, automatic destruction once it becomes eligible. .It Fl n Do a dry-run .Pq Qq No-op deletion. No data will be deleted. 
This is useful in conjunction with the .Fl p or .Fl v flags to determine what data would be deleted. .It Fl p Print machine-parsable verbose information about the deleted data. .It Fl r Destroy .Pq or mark for deferred deletion all snapshots with this name in descendent file systems. .It Fl v Print verbose information about the deleted data. .El .Pp Extreme care should be taken when applying either the .Fl r or the .Fl R options, as they can destroy large portions of a pool and cause unexpected behavior for mounted file systems in use. .It Xo .Nm zfs .Cm destroy .Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark .Xc The given bookmark is destroyed. .El . .Sh EXAMPLES .\" These are, respectively, examples 3, 10, 15 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Creating and Destroying Multiple Snapshots The following command creates snapshots named .Ar yesterday No of Ar pool/home and all of its descendent file systems. Each snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of its file system. The second command destroys the newly created snapshots. .Dl # Nm zfs Cm snapshot Fl r Ar pool/home Ns @ Ns Ar yesterday .Dl # Nm zfs Cm destroy Fl r Ar pool/home Ns @ Ns Ar yesterday . .Ss Example 2 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -compact -offset Ds .No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data .No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today .No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them .No # Nm zfs Cm promote Ar pool/project/beta .No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy .No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed .No # Nm zfs Cm destroy Ar pool/project/legacy .Ed . .Ss Example 3 : No Performing a Rolling Snapshot The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .Bd -literal -compact -offset Ds .No # Nm zfs Cm destroy Fl r Ar pool/users@7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@6daysago No @ Ns Ar 7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@5daysago No @ Ns Ar 6daysago .No # Nm zfs Cm rename Fl r Ar pool/users@4daysago No @ Ns Ar 5daysago .No # Nm zfs Cm rename Fl r Ar pool/users@3daysago No @ Ns Ar 4daysago .No # Nm zfs Cm rename Fl r Ar pool/users@2daysago No @ Ns Ar 3daysago .No # Nm zfs Cm rename Fl r Ar pool/users@yesterday No @ Ns Ar 2daysago .No # Nm zfs Cm rename Fl r Ar pool/users@today No @ Ns Ar yesterday .No # Nm zfs Cm snapshot Fl r Ar pool/users Ns @ Ns Ar today .Ed . .Sh SEE ALSO .Xr zfs-create 8 , .Xr zfs-hold 8 , .Xr zfsprops 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-diff.8 b/sys/contrib/openzfs/man/man8/zfs-diff.8 index d4c48f4109be..5b94ea524666 100644 --- a/sys/contrib/openzfs/man/man8/zfs-diff.8 +++ b/sys/contrib/openzfs/man/man8/zfs-diff.8 @@ -1,122 +1,122 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). 
.\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-DIFF 8 .Os . .Sh NAME .Nm zfs-diff .Nd show difference between ZFS snapshots .Sh SYNOPSIS .Nm zfs .Cm diff .Op Fl FHth .Ar snapshot Ar snapshot Ns | Ns Ar filesystem . .Sh DESCRIPTION Display the difference between a snapshot of a given filesystem and another snapshot of that filesystem from a later time or the current contents of the filesystem. The first column is a character indicating the type of change, the other columns indicate pathname, new pathname .Pq in case of rename , change in link count, and optionally file type and/or change time. The types of change are: .Bl -tag -compact -offset Ds -width "M" .It Sy - The path has been removed .It Sy + The path has been created .It Sy M The path has been modified .It Sy R The path has been renamed .El .Bl -tag -width "-F" .It Fl F Display an indication of the type of file, in a manner similar to the .Fl F option of .Xr ls 1 . .Bl -tag -compact -offset 2n -width "B" .It Sy B Block device .It Sy C Character device .It Sy / Directory .It Sy > Door .It Sy |\& Named pipe .It Sy @ Symbolic link .It Sy P Event port .It Sy = Socket .It Sy F Regular file .El .It Fl H Give more parsable tab-separated output, without header lines and without arrows. .It Fl t Display the path's inode change time as the first column of output. .It Fl h Do not .Sy \e0 Ns Ar ooo Ns -escape non-ASCII paths. .El . .Sh EXAMPLES .\" These are, respectively, examples 22 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Showing the differences between a snapshot and a ZFS Dataset The following example shows how to see what has changed between a prior snapshot of a ZFS dataset and its current state. The .Fl F option is used to indicate type information for the files affected. .Bd -literal -compact -offset Ds .No # Nm zfs Cm diff Fl F Ar tank/test@before tank/test M / /tank/test/ M F /tank/test/linked (+1) R F /tank/test/oldname -> /tank/test/newname - F /tank/test/deleted + F /tank/test/created M F /tank/test/modified .Ed . 
.Sh SEE ALSO .Xr zfs-snapshot 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-hold.8 b/sys/contrib/openzfs/man/man8/zfs-hold.8 index 0c88937f0dc8..a877e428f88b 100644 --- a/sys/contrib/openzfs/man/man8/zfs-hold.8 +++ b/sys/contrib/openzfs/man/man8/zfs-hold.8 @@ -1,115 +1,115 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd June 30, 2019 +.Dd November 8, 2022 .Dt ZFS-HOLD 8 .Os . .Sh NAME .Nm zfs-hold .Nd hold ZFS snapshots to prevent their removal .Sh SYNOPSIS .Nm zfs .Cm hold .Op Fl r .Ar tag Ar snapshot Ns … .Nm zfs .Cm holds .Op Fl rHp .Ar snapshot Ns … .Nm zfs .Cm release .Op Fl r .Ar tag Ar snapshot Ns … . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm hold .Op Fl r .Ar tag Ar snapshot Ns … .Xc Adds a single reference, named with the .Ar tag argument, to the specified snapshots. Each snapshot has its own tag namespace, and tags must be unique within that space. .Pp If a hold exists on a snapshot, attempts to destroy that snapshot by using the .Nm zfs Cm destroy command return .Sy EBUSY . .Bl -tag -width "-r" .It Fl r Specifies that a hold with the given tag is applied recursively to the snapshots of all descendent file systems. .El .It Xo .Nm zfs .Cm holds .Op Fl rHp .Ar snapshot Ns … .Xc Lists all existing user references for the given snapshot or snapshots. .Bl -tag -width "-r" .It Fl r Lists the holds that are set on the named descendent snapshots, in addition to listing the holds on the named snapshot. .It Fl H Do not print headers, use tab-delimited output. .It Fl p Prints holds timestamps as Unix epoch timestamps. .El .It Xo .Nm zfs .Cm release .Op Fl r .Ar tag Ar snapshot Ns … .Xc Removes a single reference, named with the .Ar tag argument, from the specified snapshot or snapshots. The tag must already exist for each snapshot. If a hold exists on a snapshot, attempts to destroy that snapshot by using the .Nm zfs Cm destroy command return .Sy EBUSY . .Bl -tag -width "-r" .It Fl r Recursively releases a hold with the given tag on the snapshots of all descendent file systems. .El .El . 
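.Sh EXAMPLES
.\" Added illustrative sketch; the snapshot "tank/home@monday" and the tag "keep" are assumed placeholder names, not taken from zfs.8.
.Ss Example 1 : No Holding and Releasing a Snapshot
The following sketch (the snapshot
.Ar tank/home@monday
and the tag
.Ar keep
are placeholder names) adds a hold, lists the holds on the snapshot, and then releases the hold so the snapshot may be destroyed again:
.Bd -literal -compact -offset Ds
.No # Nm zfs Cm hold Ar keep tank/home@monday
.No # Nm zfs Cm holds Ar tank/home@monday
.No # Nm zfs Cm release Ar keep tank/home@monday
.Ed
.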
.Sh SEE ALSO .Xr zfs-destroy 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-jail.8 b/sys/contrib/openzfs/man/man8/zfs-jail.8 index 53499a279d05..569f5f57eab4 100644 --- a/sys/contrib/openzfs/man/man8/zfs-jail.8 +++ b/sys/contrib/openzfs/man/man8/zfs-jail.8 @@ -1,125 +1,125 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2011, Pawel Jakub Dawidek .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, Bryan Drewery .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright (c) 2014, Xin LI .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd May 27, 2021 +.Dd July 11, 2022 .Dt ZFS-JAIL 8 .Os . .Sh NAME .Nm zfs-jail .Nd attach or detach ZFS filesystem from FreeBSD jail .Sh SYNOPSIS .Nm zfs Cm jail .Ar jailid Ns | Ns Ar jailname .Ar filesystem .Nm zfs Cm unjail .Ar jailid Ns | Ns Ar jailname .Ar filesystem . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm jail .Ar jailid Ns | Ns Ar jailname .Ar filesystem .Xc Attach the specified .Ar filesystem to the jail identified by JID .Ar jailid or name .Ar jailname . From now on this file system tree can be managed from within a jail if the .Sy jailed property has been set. To use this functionality, the jail needs the .Sy allow.mount and .Sy allow.mount.zfs parameters set to .Sy 1 and the .Sy enforce_statfs parameter set to a value lower than .Sy 2 . .Pp You cannot attach a jailed dataset's children to another jail. You can also not attach the root file system of the jail or any dataset which needs to be mounted before the zfs rc script is run inside the jail, as it would be attached unmounted until it is mounted from the rc script inside the jail. .Pp To allow management of the dataset from within a jail, the .Sy jailed property has to be set and the jail needs access to the .Pa /dev/zfs device. The .Sy quota property cannot be changed from within a jail. .Pp After a dataset is attached to a jail and the .Sy jailed property is set, a jailed file system cannot be mounted outside the jail, since the jail administrator might have set the mount point to an unacceptable value. 
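.Pp
.\" Added illustrative sketch; the jail name "myjail" and the dataset "tank/jails/data" are assumed placeholders, and the exact ordering is one common workflow rather than a requirement.
As an illustrative sketch (the jail
.Ar myjail
and the dataset
.Ar tank/jails/data
are placeholders), a dataset is typically marked as jailed and then attached, after which it can be mounted from inside the jail:
.Bd -literal -compact -offset Ds
.No # Nm zfs Cm set Sy jailed Ns = Ns Sy on Ar tank/jails/data
.No # Nm zfs Cm jail Ar myjail tank/jails/data
.No # Nm jexec Ar myjail Nm zfs Cm mount Ar tank/jails/data
.Ed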
.Pp See .Xr jail 8 for more information on managing jails. Jails are a .Fx feature and are not relevant on other platforms. .It Xo .Nm zfs .Cm unjail .Ar jailid Ns | Ns Ar jailname .Ar filesystem .Xc Detaches the specified .Ar filesystem from the jail identified by JID .Ar jailid or name .Ar jailname . .El .Sh SEE ALSO .Xr zfsprops 7 , .Xr jail 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-list.8 b/sys/contrib/openzfs/man/man8/zfs-list.8 index 677d8292e207..42eff94f9762 100644 --- a/sys/contrib/openzfs/man/man8/zfs-list.8 +++ b/sys/contrib/openzfs/man/man8/zfs-list.8 @@ -1,354 +1,363 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd February 8, 2024 +.Dd August 25, 2025 .Dt ZFS-LIST 8 .Os . .Sh NAME .Nm zfs-list .Nd list properties of ZFS datasets .Sh SYNOPSIS .Nm zfs .Cm list .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl j Op Ar --json-int .Oo Fl o Ar property Ns Oo , Ns Ar property Oc Ns … Oc .Oo Fl s Ar property Oc Ns … .Oo Fl S Ar property Oc Ns … .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc .Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Oc Ns … . .Sh DESCRIPTION -If specified, you can list property information by the absolute pathname or the -relative pathname. -By default, all file systems and volumes are displayed. +By default, all file systems and volumes are displayed, with the following +fields: +.Sy name , Sy used , Sy available , Sy referenced , Sy mountpoint . Snapshots are displayed if the .Sy listsnapshots pool property is .Sy on .Po the default is .Sy off -.Pc , +.Pc or if the .Fl t Sy snapshot or .Fl t Sy all options are specified. -The following fields are displayed: -.Sy name , Sy used , Sy available , Sy referenced , Sy mountpoint . .Bl -tag -width "-H" .It Fl H Used for scripting mode. -Do not print headers and separate fields by a single tab instead of arbitrary +Do not print headers, and separate fields by a single tab instead of arbitrary white space. .It Fl j , -json Op Ar --json-int Print the output in JSON format. Specify .Sy --json-int to print the numbers in integer format instead of strings in JSON output. .It Fl d Ar depth Recursively display any children of the dataset, limiting the recursion to .Ar depth . 
A .Ar depth of .Sy 1 will display only the dataset and its direct children. .It Fl o Ar property A comma-separated list of properties to display. -The property must be: +Each property must be: .Bl -bullet -compact .It One of the properties described in the .Sx Native Properties section of .Xr zfsprops 7 .It A user property .It The value .Sy name to display the dataset name .It The value .Sy space to display space usage properties on file systems and volumes. This is a shortcut for specifying .Fl o Ns \ \& Ns Sy name , Ns Sy avail , Ns Sy used , Ns Sy usedsnap , Ns .Sy usedds , Ns Sy usedrefreserv , Ns Sy usedchild .Fl t Sy filesystem , Ns Sy volume . .El .It Fl p Display numbers in parsable .Pq exact values. .It Fl r Recursively display any children of the dataset on the command line. .It Fl s Ar property A property for sorting the output by column in ascending order based on the value of the property. The property must be one of the properties described in the .Sx Properties section of .Xr zfsprops 7 or the value .Sy name to sort by the dataset name. -Multiple properties can be specified at one time using multiple +Multiple properties can be specified to operate together using multiple .Fl s -property options. +or +.Fl S +options. Multiple .Fl s -options are evaluated from left to right in decreasing order of importance. -The following is a list of sorting criteria: +and +.Fl S +options are evaluated from left to right to supply sort keys in +decreasing order of priority. +Property types operate as follows: .Bl -bullet -compact .It Numeric types sort in numeric order. .It String types sort in alphabetical order. .It -Types inappropriate for a row sort that row to the literal bottom, regardless of -the specified ordering. +Types inappropriate for a row sort that row to the literal bottom, +regardless of the specified ordering. .El .Pp -If no sorting options are specified the existing behavior of -.Nm zfs Cm list -is preserved. +If no sort columns are specified, or if two lines of output would sort +equally across all specified columns, then datasets and bookmarks are +sorted by name, whereas snapshots are sorted first by the name of their +dataset and then by the time of their creation. +When no sort columns are specified but snapshots are listed, this +default behavior causes snapshots to be grouped under their datasets in +chronological order by creation time. .It Fl S Ar property Same as .Fl s , -but sorts by property in descending order. +but sorts by +.Ar property +in descending order. .It Fl t Ar type A comma-separated list of types to display, where .Ar type is one of .Sy filesystem , .Sy snapshot , .Sy volume , .Sy bookmark , or .Sy all . For example, specifying .Fl t Sy snapshot displays only snapshots. .Sy fs , .Sy snap , or .Sy vol can be used as aliases for .Sy filesystem , .Sy snapshot , or .Sy volume . .El . .Sh EXAMPLES .\" These are, respectively, examples 5 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Listing ZFS Datasets The following command lists all active file systems and volumes in the system. Snapshots are displayed if .Sy listsnaps Ns = Ns Sy on . The default is .Sy off . See .Xr zpoolprops 7 for more information on pool properties. 
.Bd -literal -compact -offset Ds .No # Nm zfs Cm list NAME USED AVAIL REFER MOUNTPOINT pool 450K 457G 18K /pool pool/home 315K 457G 21K /export/home pool/home/anne 18K 457G 18K /export/home/anne pool/home/bob 276K 457G 276K /export/home/bob .Ed .Ss Example 2 : No Listing ZFS filesystems and snapshots in JSON format .Bd -literal -compact -offset Ds .No # Nm zfs Cm list Fl j Fl t Ar filesystem,snapshot | Cm jq { "output_version": { "command": "zfs list", "vers_major": 0, "vers_minor": 1 }, "datasets": { "pool": { "name": "pool", "type": "FILESYSTEM", "pool": "pool", "properties": { "used": { "value": "290K", "source": { "type": "NONE", "data": "-" } }, "available": { "value": "30.5G", "source": { "type": "NONE", "data": "-" } }, "referenced": { "value": "24K", "source": { "type": "NONE", "data": "-" } }, "mountpoint": { "value": "/pool", "source": { "type": "DEFAULT", "data": "-" } } } }, "pool/home": { "name": "pool/home", "type": "FILESYSTEM", "pool": "pool", "properties": { "used": { "value": "48K", "source": { "type": "NONE", "data": "-" } }, "available": { "value": "30.5G", "source": { "type": "NONE", "data": "-" } }, "referenced": { "value": "24K", "source": { "type": "NONE", "data": "-" } }, "mountpoint": { "value": "/mnt/home", "source": { "type": "LOCAL", "data": "-" } } } }, "pool/home/bob": { "name": "pool/home/bob", "type": "FILESYSTEM", "pool": "pool", "properties": { "used": { "value": "24K", "source": { "type": "NONE", "data": "-" } }, "available": { "value": "30.5G", "source": { "type": "NONE", "data": "-" } }, "referenced": { "value": "24K", "source": { "type": "NONE", "data": "-" } }, "mountpoint": { "value": "/mnt/home/bob", "source": { "type": "INHERITED", "data": "pool/home" } } } }, "pool/home/bob@v1": { "name": "pool/home/bob@v1", "type": "SNAPSHOT", "pool": "pool", "dataset": "pool/home/bob", "snapshot_name": "v1", "properties": { "used": { "value": "0B", "source": { "type": "NONE", "data": "-" } }, "available": { "value": "-", "source": { "type": "NONE", "data": "-" } }, "referenced": { "value": "24K", "source": { "type": "NONE", "data": "-" } }, "mountpoint": { "value": "-", "source": { "type": "NONE", "data": "-" } } } } } } .Ed . .Sh SEE ALSO .Xr zfsprops 7 , .Xr zfs-get 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-load-key.8 b/sys/contrib/openzfs/man/man8/zfs-load-key.8 index 7838c46d9e77..3a11cea99fd6 100644 --- a/sys/contrib/openzfs/man/man8/zfs-load-key.8 +++ b/sys/contrib/openzfs/man/man8/zfs-load-key.8 @@ -1,305 +1,305 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. 
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd January 13, 2020 +.Dd July 11, 2022 .Dt ZFS-LOAD-KEY 8 .Os . .Sh NAME .Nm zfs-load-key .Nd load, unload, or change encryption key of ZFS dataset .Sh SYNOPSIS .Nm zfs .Cm load-key .Op Fl nr .Op Fl L Ar keylocation .Fl a Ns | Ns Ar filesystem .Nm zfs .Cm unload-key .Op Fl r .Fl a Ns | Ns Ar filesystem .Nm zfs .Cm change-key .Op Fl l .Op Fl o Ar keylocation Ns = Ns Ar value .Op Fl o Ar keyformat Ns = Ns Ar value .Op Fl o Ar pbkdf2iters Ns = Ns Ar value .Ar filesystem .Nm zfs .Cm change-key .Fl i .Op Fl l .Ar filesystem . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm load-key .Op Fl nr .Op Fl L Ar keylocation .Fl a Ns | Ns Ar filesystem .Xc Load the key for .Ar filesystem , allowing it and all children that inherit the .Sy keylocation property to be accessed. The key will be expected in the format specified by the .Sy keyformat and location specified by the .Sy keylocation property. Note that if the .Sy keylocation is set to .Sy prompt the terminal will interactively wait for the key to be entered. Loading a key will not automatically mount the dataset. If that functionality is desired, .Nm zfs Cm mount Fl l will ask for the key and mount the dataset .Po see .Xr zfs-mount 8 .Pc . Once the key is loaded the .Sy keystatus property will become .Sy available . .Bl -tag -width "-r" .It Fl r Recursively loads the keys for the specified filesystem and all descendent encryption roots. .It Fl a Loads the keys for all encryption roots in all imported pools. .It Fl n Do a dry-run .Pq Qq No-op .Cm load-key . This will cause .Nm zfs to simply check that the provided key is correct. This command may be run even if the key is already loaded. .It Fl L Ar keylocation Use .Ar keylocation instead of the .Sy keylocation property. This will not change the value of the property on the dataset. Note that if used with either .Fl r or .Fl a , .Ar keylocation may only be given as .Sy prompt . .El .It Xo .Nm zfs .Cm unload-key .Op Fl r .Fl a Ns | Ns Ar filesystem .Xc Unloads a key from ZFS, removing the ability to access the dataset and all of its children that inherit the .Sy keylocation property. This requires that the dataset is not currently open or mounted. Once the key is unloaded the .Sy keystatus property will become .Sy unavailable . .Bl -tag -width "-r" .It Fl r Recursively unloads the keys for the specified filesystem and all descendent encryption roots. .It Fl a Unloads the keys for all encryption roots in all imported pools. .El .It Xo .Nm zfs .Cm change-key .Op Fl l .Op Fl o Ar keylocation Ns = Ns Ar value .Op Fl o Ar keyformat Ns = Ns Ar value .Op Fl o Ar pbkdf2iters Ns = Ns Ar value .Ar filesystem .Xc .It Xo .Nm zfs .Cm change-key .Fl i .Op Fl l .Ar filesystem .Xc Changes the user's key (e.g. a passphrase) used to access a dataset. This command requires that the existing key for the dataset is already loaded. This command may also be used to change the .Sy keylocation , .Sy keyformat , and .Sy pbkdf2iters properties as needed. If the dataset was not previously an encryption root it will become one. Alternatively, the .Fl i flag may be provided to cause an encryption root to inherit the parent's key instead. 
.Pp If the user's key is compromised, .Nm zfs Cm change-key does not necessarily protect existing or newly-written data from attack. Newly-written data will continue to be encrypted with the same master key as the existing data. The master key is compromised if an attacker obtains a user key and the corresponding wrapped master key. Currently, .Nm zfs Cm change-key does not overwrite the previous wrapped master key on disk, so it is accessible via forensic analysis for an indeterminate length of time. .Pp In the event of a master key compromise, ideally the drives should be securely erased to remove all the old data (which is readable using the compromised master key), a new pool created, and the data copied back. This can be approximated in place by creating new datasets, copying the data .Pq e.g. using Nm zfs Cm send | Nm zfs Cm recv , and then clearing the free space with .Nm zpool Cm trim Fl -secure if supported by your hardware, otherwise .Nm zpool Cm initialize . .Bl -tag -width "-r" .It Fl l Ensures the key is loaded before attempting to change the key. This is effectively equivalent to running .Nm zfs Cm load-key Ar filesystem ; Nm zfs Cm change-key Ar filesystem .It Fl o Ar property Ns = Ns Ar value Allows the user to set encryption key properties .Pq Sy keyformat , keylocation , No and Sy pbkdf2iters while changing the key. This is the only way to alter .Sy keyformat and .Sy pbkdf2iters after the dataset has been created. .It Fl i Indicates that zfs should make .Ar filesystem inherit the key of its parent. Note that this command can only be run on an encryption root that has an encrypted parent. .El .El .Ss Encryption Enabling the .Sy encryption feature allows for the creation of encrypted filesystems and volumes. ZFS will encrypt file and volume data, file attributes, ACLs, permission bits, directory listings, FUID mappings, and .Sy userused Ns / Ns Sy groupused data. ZFS will not encrypt metadata related to the pool structure, including dataset and snapshot names, dataset hierarchy, properties, file size, file holes, and deduplication tables (though the deduplicated data itself is encrypted). .Pp Key rotation is managed by ZFS. Changing the user's key (e.g. a passphrase) does not require re-encrypting the entire dataset. Datasets can be scrubbed, resilvered, renamed, and deleted without the encryption keys being loaded (see the .Cm load-key subcommand for more info on key loading). .Pp Creating an encrypted dataset requires specifying the .Sy encryption No and Sy keyformat properties at creation time, along with an optional .Sy keylocation No and Sy pbkdf2iters . After entering an encryption key, the created dataset will become an encryption root. Any descendant datasets will inherit their encryption key from the encryption root by default, meaning that loading, unloading, or changing the key for the encryption root will implicitly do the same for all inheriting datasets. If this inheritance is not desired, simply supply a .Sy keyformat when creating the child dataset or use .Nm zfs Cm change-key to break an existing relationship, creating a new encryption root on the child. Note that the child's .Sy keyformat may match that of the parent while still creating a new encryption root, and that changing the .Sy encryption property alone does not create a new encryption root; this would simply use a different cipher suite with the same key as its encryption root. The one exception is that clones will always use their origin's encryption key. 
As a result of this exception, some encryption-related properties .Pq namely Sy keystatus , keyformat , keylocation , No and Sy pbkdf2iters do not inherit like other ZFS properties and instead use the value determined by their encryption root. Encryption root inheritance can be tracked via the read-only .Sy encryptionroot property. .Pp Encryption changes the behavior of a few ZFS operations. Encryption is applied after compression so compression ratios are preserved. Normally checksums in ZFS are 256 bits long, but for encrypted data the checksum is 128 bits of the user-chosen checksum and 128 bits of MAC from the encryption suite, which provides additional protection against maliciously altered data. Deduplication is still possible with encryption enabled but for security, datasets will only deduplicate against themselves, their snapshots, and their clones. .Pp There are a few limitations on encrypted datasets. Encrypted data cannot be embedded via the .Sy embedded_data feature. Encrypted datasets may not have .Sy copies Ns = Ns Em 3 since the implementation stores some encryption metadata where the third copy would normally be. Since compression is applied before encryption, datasets may be vulnerable to a CRIME-like attack if applications accessing the data allow for it. Deduplication with encryption will leak information about which blocks are equivalent in a dataset and will incur an extra CPU cost for each block written. . .Sh SEE ALSO .Xr zfsprops 7 , .Xr zfs-create 8 , .Xr zfs-set 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-mount-generator.8.in b/sys/contrib/openzfs/man/man8/zfs-mount-generator.8.in index ea470247daac..9e44ea30c636 100644 --- a/sys/contrib/openzfs/man/man8/zfs-mount-generator.8.in +++ b/sys/contrib/openzfs/man/man8/zfs-mount-generator.8.in @@ -1,183 +1,183 @@ .\" SPDX-License-Identifier: MIT .\" .\" Copyright 2018 Antonio Russo .\" Copyright 2019 Kjeld Schouten-Lebbing .\" Copyright 2020 InsanePrawn .\" .\" Permission is hereby granted, free of charge, to any person obtaining .\" a copy of this software and associated documentation files (the .\" "Software"), to deal in the Software without restriction, including .\" without limitation the rights to use, copy, modify, merge, publish, .\" distribute, sublicense, and/or sell copies of the Software, and to .\" permit persons to whom the Software is furnished to do so, subject to .\" the following conditions: .\" .\" The above copyright notice and this permission notice shall be .\" included in all copies or substantial portions of the Software. .\" .\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, .\" EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF .\" MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND .\" NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE .\" LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION .\" OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION .\" WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. .\" -.Dd May 31, 2021 +.Dd November 30, 2021 .Dt ZFS-MOUNT-GENERATOR 8 .Os . .Sh NAME .Nm zfs-mount-generator .Nd generate systemd mount units for ZFS filesystems .Sh SYNOPSIS .Pa @systemdgeneratordir@/zfs-mount-generator . .Sh DESCRIPTION .Nm is a .Xr systemd.generator 7 that generates native .Xr systemd.mount 5 units for configured ZFS datasets. . 
.Ss Properties .Bl -tag -compact -width "org.openzfs.systemd:required-by=unit[ unit]…" .It Sy mountpoint Ns = .No Skipped if Sy legacy No or Sy none . . .It Sy canmount Ns = .No Skipped if Sy off . .No Skipped if only Sy noauto datasets exist for a given mountpoint and there's more than one. .No Datasets with Sy yes No take precedence over ones with Sy noauto No for the same mountpoint . .No Sets logical Em noauto No flag if Sy noauto . Encryption roots always generate .Sy zfs-load-key@ Ns Ar root Ns Sy .service , even if .Sy off . . .It Sy atime Ns = , Sy relatime Ns = , Sy devices Ns = , Sy exec Ns = , Sy readonly Ns = , Sy setuid Ns = , Sy nbmand Ns = Used to generate mount options equivalent to .Nm zfs Cm mount . . .It Sy encroot Ns = , Sy keylocation Ns = If the dataset is an encryption root, its mount unit will bind to .Sy zfs-load-key@ Ns Ar root Ns Sy .service , with additional dependencies as follows: .Bl -tag -compact -offset Ds -width "keylocation=https://URL (et al.)" .It Sy keylocation Ns = Ns Sy prompt None, uses .Xr systemd-ask-password 1 .It Sy keylocation Ns = Ns Sy https:// Ns Ar URL Pq et al.\& .Sy Wants Ns = , Sy After Ns = : Pa network-online.target .It Sy keylocation Ns = Ns Sy file:// Ns < Ns Ar path Ns > .Sy RequiresMountsFor Ns = Ns Ar path .El . The service also uses the same .Sy Wants Ns = , .Sy After Ns = , .Sy Requires Ns = , No and .Sy RequiresMountsFor Ns = , as the mount unit. . .It Sy org.openzfs.systemd:requires Ns = Ns Pa path Ns Oo " " Ns Pa path Oc Ns … .No Sets Sy Requires Ns = for the mount- and key-loading unit. . .It Sy org.openzfs.systemd:requires-mounts-for Ns = Ns Pa path Ns Oo " " Ns Pa path Oc Ns … .No Sets Sy RequiresMountsFor Ns = for the mount- and key-loading unit. . .It Sy org.openzfs.systemd:before Ns = Ns Pa unit Ns Oo " " Ns Pa unit Oc Ns … .No Sets Sy Before Ns = for the mount unit. . .It Sy org.openzfs.systemd:after Ns = Ns Pa unit Ns Oo " " Ns Pa unit Oc Ns … .No Sets Sy After Ns = for the mount unit. . .It Sy org.openzfs.systemd:wanted-by Ns = Ns Pa unit Ns Oo " " Ns Pa unit Oc Ns … .No Sets logical Em noauto No flag (see below) . .No If not Sy none , No sets Sy WantedBy Ns = for the mount unit. .It Sy org.openzfs.systemd:required-by Ns = Ns Pa unit Ns Oo " " Ns Pa unit Oc Ns … .No Sets logical Em noauto No flag (see below) . .No If not Sy none , No sets Sy RequiredBy Ns = for the mount unit. . .It Sy org.openzfs.systemd:nofail Ns = Ns (unset) Ns | Ns Sy on Ns | Ns Sy off Waxes or wanes strength of default reverse dependencies of the mount unit, see below. . .It Sy org.openzfs.systemd:ignore Ns = Ns Sy on Ns | Ns Sy off .No Skip if Sy on . .No Defaults to Sy off . .El . .Ss Unit Ordering And Dependencies Additionally, unless the pool the dataset resides on is imported at generation time, both units gain .Sy Wants Ns = Ns Pa zfs-import.target and .Sy After Ns = Ns Pa zfs-import.target . .Pp Additionally, unless the logical .Em noauto flag is set, the mount unit gains a reverse-dependency for .Pa local-fs.target of strength .Bl -tag -compact -offset Ds -width "(unset)" .It (unset) .Sy WantedBy Ns = No + Sy Before Ns = .It Sy on .Sy WantedBy Ns = .It Sy off .Sy RequiredBy Ns = No + Sy Before Ns = .El . .Ss Cache File Because ZFS pools may not be available very early in the boot process, information on ZFS mountpoints must be stored separately. 
The output of .Dl Nm zfs Cm list Fl Ho Ar name , Ns Aq every property above in order for datasets that should be mounted by systemd should be kept at .Pa @sysconfdir@/zfs/zfs-list.cache/ Ns Ar poolname , and, if writeable, will be kept synchronized for the entire pool by the .Pa history_event-zfs-list-cacher.sh ZEDLET, if enabled .Pq see Xr zed 8 . . .Sh ENVIRONMENT If the .Sy ZFS_DEBUG environment variable is nonzero .Pq or unset and Pa /proc/cmdline No contains Qq Sy debug , print summary accounting information at the end. . .Sh EXAMPLES To begin, enable tracking for the pool: .Dl # Nm touch Pa @sysconfdir@/zfs/zfs-list.cache/ Ns Ar poolname Then enable the tracking ZEDLET: .Dl # Nm ln Fl s Pa @zfsexecdir@/zed.d/history_event-zfs-list-cacher.sh @sysconfdir@/zfs/zed.d .Dl # Nm systemctl Cm enable Pa zfs-zed.service .Dl # Nm systemctl Cm restart Pa zfs-zed.service .Pp If no history event is in the queue, inject one to ensure the ZEDLET runs to refresh the cache file by setting a monitored property somewhere on the pool: .Dl # Nm zfs Cm set Sy relatime Ns = Ns Sy off Ar poolname/dset .Dl # Nm zfs Cm inherit Sy relatime Ar poolname/dset .Pp To test the generator output: .Dl $ Nm mkdir Pa /tmp/zfs-mount-generator .Dl $ Nm @systemdgeneratordir@/zfs-mount-generator Pa /tmp/zfs-mount-generator . If the generated units are satisfactory, instruct .Nm systemd to re-run all generators: .Dl # Nm systemctl daemon-reload . .Sh SEE ALSO .Xr systemd.mount 5 , .Xr systemd.target 5 , .Xr zfs 5 , .Xr systemd.generator 7 , .Xr systemd.special 7 , .Xr zed 8 , .Xr zpool-events 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-mount.8 b/sys/contrib/openzfs/man/man8/zfs-mount.8 index 9fca6fffd5bb..2689b6dc345b 100644 --- a/sys/contrib/openzfs/man/man8/zfs-mount.8 +++ b/sys/contrib/openzfs/man/man8/zfs-mount.8 @@ -1,140 +1,140 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd February 16, 2019 +.Dd October 12, 2024 .Dt ZFS-MOUNT 8 .Os . 
.Sh NAME .Nm zfs-mount .Nd manage mount state of ZFS filesystems .Sh SYNOPSIS .Nm zfs .Cm mount .Op Fl j .Nm zfs .Cm mount .Op Fl Oflv .Op Fl o Ar options .Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Nm zfs .Cm unmount .Op Fl fu .Fl a Ns | Ns Ar filesystem Ns | Ns Ar mountpoint . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm mount .Op Fl j .Xc Displays all ZFS file systems currently mounted. .Bl -tag -width "-j" .It Fl j , -json Displays all mounted file systems in JSON format. .El .It Xo .Nm zfs .Cm mount .Op Fl Oflv .Op Fl o Ar options .Fl a Ns | Ns Fl R Ar filesystem Ns | Ns Ar filesystem .Xc Mount ZFS filesystem on a path described by its .Sy mountpoint property, if the path exists and is empty. If .Sy mountpoint is set to .Em legacy , the filesystem should be instead mounted using .Xr mount 8 . .Bl -tag -width "-O" .It Fl O Perform an overlay mount. Allows mounting in non-empty .Sy mountpoint . See .Xr mount 8 for more information. .It Fl a Mount all available ZFS file systems. Invoked automatically as part of the boot process if configured. .It Fl R Mount the specified filesystems along with all their children. .It Ar filesystem Mount the specified filesystem. .It Fl o Ar options An optional, comma-separated list of mount options to use temporarily for the duration of the mount. See the .Em Temporary Mount Point Properties section of .Xr zfsprops 7 for details. .It Fl l Load keys for encrypted filesystems as they are being mounted. This is equivalent to executing .Nm zfs Cm load-key on each encryption root before mounting it. Note that if a filesystem has .Sy keylocation Ns = Ns Sy prompt , this will cause the terminal to interactively block after asking for the key. .It Fl v Report mount progress. .It Fl f Attempt to force mounting of all filesystems, even those that couldn't normally be mounted (e.g. redacted datasets). .El .It Xo .Nm zfs .Cm unmount .Op Fl fu .Fl a Ns | Ns Ar filesystem Ns | Ns Ar mountpoint .Xc Unmounts currently mounted ZFS file systems. .Bl -tag -width "-a" .It Fl a Unmount all available ZFS file systems. Invoked automatically as part of the shutdown process. .It Fl f Forcefully unmount the file system, even if it is currently in use. This option is not supported on Linux. .It Fl u Unload keys for any encryption roots unmounted by this command. .It Ar filesystem Ns | Ns Ar mountpoint Unmount the specified filesystem. The command can also be given a path to a ZFS file system mount point on the system. .El .El diff --git a/sys/contrib/openzfs/man/man8/zfs-project.8 b/sys/contrib/openzfs/man/man8/zfs-project.8 index 36547680f53e..4ebfdf6ffe4f 100644 --- a/sys/contrib/openzfs/man/man8/zfs-project.8 +++ b/sys/contrib/openzfs/man/man8/zfs-project.8 @@ -1,143 +1,143 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
.\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd May 27, 2021 +.Dd July 11, 2022 .Dt ZFS-PROJECT 8 .Os . .Sh NAME .Nm zfs-project .Nd manage projects in ZFS filesystem .Sh SYNOPSIS .Nm zfs .Cm project .Oo Fl d Ns | Ns Fl r Ns Oc .Ar file Ns | Ns Ar directory Ns … .Nm zfs .Cm project .Fl C .Oo Fl kr Ns Oc .Ar file Ns | Ns Ar directory Ns … .Nm zfs .Cm project .Fl c .Oo Fl 0 Ns Oc .Oo Fl d Ns | Ns Fl r Ns Oc .Op Fl p Ar id .Ar file Ns | Ns Ar directory Ns … .Nm zfs .Cm project .Op Fl p Ar id .Oo Fl rs Ns Oc .Ar file Ns | Ns Ar directory Ns … . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm project .Oo Fl d Ns | Ns Fl r Ns Oc .Ar file Ns | Ns Ar directory Ns … .Xc List project identifier (ID) and inherit flag of files and directories. .Bl -tag -width "-d" .It Fl d Show the directory project ID and inherit flag, not its children. .It Fl r List subdirectories recursively. .El .It Xo .Nm zfs .Cm project .Fl C .Oo Fl kr Ns Oc .Ar file Ns | Ns Ar directory Ns … .Xc Clear project inherit flag and/or ID on the files and directories. .Bl -tag -width "-k" .It Fl k Keep the project ID unchanged. If not specified, the project ID will be reset to zero. .It Fl r Clear subdirectories' flags recursively. .El .It Xo .Nm zfs .Cm project .Fl c .Oo Fl 0 Ns Oc .Oo Fl d Ns | Ns Fl r Ns Oc .Op Fl p Ar id .Ar file Ns | Ns Ar directory Ns … .Xc Check project ID and inherit flag on the files and directories: report entries without the project inherit flag, or with project IDs different from the target directory's project ID or the one specified with .Fl p . .Bl -tag -width "-p id" .It Fl 0 Delimit filenames with a NUL byte instead of newline, don't output diagnoses. .It Fl d Check the directory project ID and inherit flag, not its children. .It Fl p Ar id Compare to .Ar id instead of the target files and directories' project IDs. .It Fl r Check subdirectories recursively. .El .It Xo .Nm zfs .Cm project .Fl p Ar id .Oo Fl rs Ns Oc .Ar file Ns | Ns Ar directory Ns … .Xc Set project ID and/or inherit flag on the files and directories. .Bl -tag -width "-p id" .It Fl p Ar id Set the project ID to the given value. .It Fl r Set on subdirectories recursively. .It Fl s Set project inherit flag on the given files and directories. This is usually used for setting up tree quotas with .Fl r . In that case, the directory's project ID will be set for all its descendants, unless specified explicitly with .Fl p . .El .El . 
.Sh SEE ALSO .Xr zfs-projectspace 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-promote.8 b/sys/contrib/openzfs/man/man8/zfs-promote.8 index 767045812607..435a7a5d0144 100644 --- a/sys/contrib/openzfs/man/man8/zfs-promote.8 +++ b/sys/contrib/openzfs/man/man8/zfs-promote.8 @@ -1,86 +1,86 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-PROMOTE 8 .Os . .Sh NAME .Nm zfs-promote .Nd promote clone dataset to no longer depend on origin snapshot .Sh SYNOPSIS .Nm zfs .Cm promote .Ar clone . .Sh DESCRIPTION The .Nm zfs Cm promote command makes it possible to destroy the dataset that the clone was created from. The clone parent-child dependency relationship is reversed, so that the origin dataset becomes a clone of the specified dataset. .Pp The snapshot that was cloned, and any snapshots previous to this snapshot, are now owned by the promoted clone. The space they use moves from the origin dataset to the promoted clone, so enough space must be available to accommodate these snapshots. No new space is consumed by this operation, but the space accounting is adjusted. The promoted clone must not have any conflicting snapshot names of its own. The .Nm zfs Cm rename subcommand can be used to rename any conflicting snapshots. . 
.Sh EXAMPLES .\" These are, respectively, examples 10 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -compact -offset Ds .No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data .No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today .No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them .No # Nm zfs Cm promote Ar pool/project/beta .No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy .No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed .No # Nm zfs Cm destroy Ar pool/project/legacy .Ed . .Sh SEE ALSO .Xr zfs-clone 8 , .Xr zfs-rename 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-rename.8 b/sys/contrib/openzfs/man/man8/zfs-rename.8 index 4cf192c0682b..8fedc67469e6 100644 --- a/sys/contrib/openzfs/man/man8/zfs-rename.8 +++ b/sys/contrib/openzfs/man/man8/zfs-rename.8 @@ -1,161 +1,161 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-RENAME 8 .Os . .Sh NAME .Nm zfs-rename .Nd rename ZFS dataset .Sh SYNOPSIS .Nm zfs .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm zfs .Cm rename .Fl p .Op Fl f .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume .Nm zfs .Cm rename .Fl u .Op Fl f .Ar filesystem Ar filesystem .Nm zfs .Cm rename .Fl r .Ar snapshot Ar snapshot . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm rename .Op Fl f .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc .It Xo .Nm zfs .Cm rename .Fl p .Op Fl f .Ar filesystem Ns | Ns Ar volume .Ar filesystem Ns | Ns Ar volume .Xc .It Xo .Nm zfs .Cm rename .Fl u .Op Fl f .Ar filesystem .Ar filesystem .Xc Renames the given dataset. 
The new target can be located anywhere in the ZFS hierarchy, with the exception of snapshots. Snapshots can only be renamed within the parent file system or volume. When renaming a snapshot, the parent file system of the snapshot does not need to be specified as part of the second argument. Renamed file systems can inherit new mount points, in which case they are unmounted and remounted at the new mount point. .Bl -tag -width "-a" .It Fl f Force unmount any file systems that need to be unmounted in the process. This flag has no effect if used together with the .Fl u flag. .It Fl p Creates all the nonexistent parent datasets. Datasets created in this manner are automatically mounted according to the .Sy mountpoint property inherited from their parent. .It Fl u Do not remount file systems during rename. If a file system's .Sy mountpoint property is set to .Sy legacy or .Sy none , the file system is not unmounted even if this option is not given. .El .It Xo .Nm zfs .Cm rename .Fl r .Ar snapshot Ar snapshot .Xc Recursively rename the snapshots of all descendent datasets. Snapshots are the only dataset that can be renamed recursively. .El . .Sh EXAMPLES .\" These are, respectively, examples 10, 15 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -compact -offset Ds .No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data .No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today .No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them .No # Nm zfs Cm promote Ar pool/project/beta .No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy .No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed .No # Nm zfs Cm destroy Ar pool/project/legacy .Ed . .Ss Example 2 : No Performing a Rolling Snapshot The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .Bd -literal -compact -offset Ds .No # Nm zfs Cm destroy Fl r Ar pool/users@7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@6daysago No @ Ns Ar 7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@5daysago No @ Ns Ar 6daysago .No # Nm zfs Cm rename Fl r Ar pool/users@4daysago No @ Ns Ar 5daysago .No # Nm zfs Cm rename Fl r Ar pool/users@3daysago No @ Ns Ar 4daysago .No # Nm zfs Cm rename Fl r Ar pool/users@2daysago No @ Ns Ar 3daysago .No # Nm zfs Cm rename Fl r Ar pool/users@yesterday No @ Ns Ar 2daysago .No # Nm zfs Cm rename Fl r Ar pool/users@today No @ Ns Ar yesterday .No # Nm zfs Cm snapshot Fl r Ar pool/users Ns @ Ns Ar today .Ed diff --git a/sys/contrib/openzfs/man/man8/zfs-rewrite.8 b/sys/contrib/openzfs/man/man8/zfs-rewrite.8 index a3a037f3794a..ca5340c7e5eb 100644 --- a/sys/contrib/openzfs/man/man8/zfs-rewrite.8 +++ b/sys/contrib/openzfs/man/man8/zfs-rewrite.8 @@ -1,90 +1,90 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). 
.\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2025 iXsystems, Inc. .\" -.Dd May 6, 2025 +.Dd July 23, 2025 .Dt ZFS-REWRITE 8 .Os . .Sh NAME .Nm zfs-rewrite .Nd rewrite specified files without modification .Sh SYNOPSIS .Nm zfs .Cm rewrite .Oo Fl Prvx Ns Oc .Op Fl l Ar length .Op Fl o Ar offset .Ar file Ns | Ns Ar directory Ns … . .Sh DESCRIPTION Rewrite blocks of specified .Ar file as is without modification at a new location and possibly with new properties, such as checksum, compression, dedup, copies, etc, as if they were atomically read and written back. .Bl -tag -width "-r" .It Fl P Perform physical rewrite, preserving logical birth time of blocks. By default, rewrite updates logical birth times, making blocks appear as modified in snapshots and incremental send streams. Physical rewrite preserves logical birth times, avoiding unnecessary inclusion in incremental streams. Physical rewrite requires the .Sy physical_rewrite feature to be enabled on the pool. .It Fl l Ar length Rewrite at most this number of bytes. .It Fl o Ar offset Start at this offset in bytes. .It Fl r Recurse into directories. .It Fl v Print names of all successfully rewritten files. .It Fl x Don't cross file system mount points when recursing. .El .Sh NOTES Rewrite of cloned blocks and blocks that are part of any snapshots, same as some property changes may increase pool space usage. Holes that were never written or were previously zero-compressed are not rewritten and will remain holes even if compression is disabled. .Pp If a .Fl l or .Fl o value request a rewrite to regions past the end of the file, then those regions are silently ignored, and no error is reported. .Pp By default, rewritten blocks update their logical birth time, meaning they will be included in incremental .Nm zfs Cm send streams as modified data. When the .Fl P flag is used, rewritten blocks preserve their logical birth time, since there are no user data changes. . .Sh SEE ALSO .Xr zfsprops 7 , .Xr zpool-features 7 diff --git a/sys/contrib/openzfs/man/man8/zfs-send.8 b/sys/contrib/openzfs/man/man8/zfs-send.8 index f7c6b840303c..6c5f6b94afd5 100644 --- a/sys/contrib/openzfs/man/man8/zfs-send.8 +++ b/sys/contrib/openzfs/man/man8/zfs-send.8 @@ -1,747 +1,747 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
.\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" Copyright (c) 2024, Klara, Inc. .\" -.Dd October 2, 2024 +.Dd August 29, 2025 .Dt ZFS-SEND 8 .Os . .Sh NAME .Nm zfs-send .Nd generate backup stream of ZFS dataset .Sh SYNOPSIS .Nm zfs .Cm send .Op Fl DLPVbcehnpsvw .Op Fl R Op Fl X Ar dataset Ns Oo , Ns Ar dataset Oc Ns … .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot .Nm zfs .Cm send .Op Fl DLPVcensvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPVcenpv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot .Nm zfs .Cm send .Op Fl PVenv .Fl t .Ar receive_resume_token .Nm zfs .Cm send .Op Fl PVnv .Fl S Ar filesystem .Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns … . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm send .Op Fl DLPVbcehnpsvw .Op Fl R Op Fl X Ar dataset Ns Oo , Ns Ar dataset Oc Ns … .Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot .Ar snapshot .Xc Creates a stream representation of the second .Ar snapshot , which is written to standard output. The output can be redirected to a file or to a different system .Po for example, using .Xr ssh 1 .Pc . By default, a full stream is generated. .Bl -tag -width "-D" .It Fl D , -dedup Deduplicated send is no longer supported. This flag is accepted for backwards compatibility, but a regular, non-deduplicated stream will be generated. .It Fl I Ar snapshot Generate a stream package that sends all intermediary snapshots from the first snapshot to the second snapshot. For example, .Fl I Em @a Em fs@d is similar to .Fl i Em @a Em fs@b Ns \&; Fl i Em @b Em fs@c Ns \&; Fl i Em @c Em fs@d . The incremental source may be specified as with the .Fl i option. .It Fl L , -large-block Generate a stream which may contain blocks larger than 128 KiB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128 KiB. The receiving system must have the .Sy large_blocks pool feature enabled as well. This flag is required if the .Sy large_microzap pool feature is active. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P , -parsable Print machine-parsable verbose information about the stream package generated. .It Fl R , -replicate Generate a replication stream package, which will replicate the specified file system, and all descendent file systems, up to the named snapshot. When received, all properties, snapshots, descendent file systems, and clones are preserved. .Pp If the .Fl i or .Fl I flags are used in conjunction with the .Fl R flag, an incremental replication stream is generated. 
The current values of properties, and current snapshot and file system names are set when the stream is received. If the .Fl F flag is specified when this stream is received, snapshots and file systems that do not exist on the sending side are destroyed. If the .Fl R flag is used to send encrypted datasets, then .Fl w must also be specified. .It Fl V , -proctitle Set the process title to a per-second report of how much data has been sent. .It Fl X , -exclude Ar dataset Ns Oo , Ns Ar dataset Oc Ns … With .Fl R , .Fl X specifies a set of datasets (and, hence, their descendants), to be excluded from the send stream. The root dataset may not be excluded. .Fl X Ar a Fl X Ar b is equivalent to .Fl X Ar a , Ns Ar b . .It Fl e , -embed Generate a more compact stream by using .Sy WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress or .Sy zstd_compress features are active on the sending system, then the receiving system must have the corresponding features enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data feature. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl b , -backup Sends only received property values whether or not they are overridden by local settings, but only if the dataset has ever been received. Use this option when you want .Nm zfs Cm receive to restore received properties backed up on the sent dataset and to avoid sending local settings that may have nothing to do with the source dataset, but only with how the data is backed up. .It Fl c , -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory .Po see the .Sy compression property for details .Pc . If the .Sy lz4_compress or .Sy zstd_compress features are active on the sending system, then the receiving system must have the corresponding features enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c , then the data will be decompressed before sending so it can be split into smaller block sizes. Streams sent with .Fl c will not have their data recompressed on the receiver side using .Fl o Sy compress Ns = Ar value . The data will stay compressed as it was from the sender. The new compression property will be set for future data. Note that uncompressed data from the sender will still attempt to compress on the receiver, unless you specify .Fl o Sy compress Ns = Em off . .It Fl w , -raw For encrypted datasets, send data exactly as it exists on disk. This allows backups to be taken even if encryption keys are not currently loaded. The backup may then be received on an untrusted machine since that machine will not have the encryption keys to read the protected data or alter it without being detected. Upon being received, the dataset will have the same encryption keys as it did on the send side, although the .Sy keylocation property will be defaulted to .Sy prompt if not otherwise provided. For unencrypted datasets, this flag will be equivalent to .Fl Lec . 
Note that if you do not use this flag for sending encrypted datasets, data will be sent unencrypted and may be re-encrypted with a different encryption key on the receiving system, which will disable the ability to do a raw send to that system for incrementals. .It Fl h , -holds Generate a stream package that includes any snapshot holds (created with the .Nm zfs Cm hold command), and indicating to .Nm zfs Cm receive that the holds be applied to the dataset on the receiving system. .It Fl i Ar snapshot Generate an incremental stream from the first .Ar snapshot .Pq the incremental source to the second .Ar snapshot .Pq the incremental target . The incremental source can be specified as the last component of the snapshot name .Po the .Sy @ character and following .Pc and it is assumed to be from the same file system as the incremental target. .Pp If the destination is a clone, the source may be the origin snapshot, which must be fully specified .Po for example, .Em pool/fs@origin , not just .Em @origin .Pc . .It Fl n , -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl p , -props Include the dataset's properties in the stream. This flag is implicit when .Fl R is specified. The receiving system must also support this feature. Sends of encrypted datasets must use .Fl w when using this flag. .It Fl s , -skip-missing Allows sending a replication stream even when there are snapshots missing in the hierarchy. When a snapshot is missing, instead of throwing an error and aborting the send, a warning is printed to the standard error stream and the dataset to which it belongs and its descendants are skipped. This flag can only be used in conjunction with .Fl R . .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. The same report can be requested by sending .Dv SIGINFO or .Dv SIGUSR1 , regardless of .Fl v . .Pp The format of the stream is committed. You will be able to receive your streams on future versions of ZFS. .El .It Xo .Nm zfs .Cm send .Op Fl DLPVcenvw .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot .Xc Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. If the destination is a filesystem or volume, the pool must be read-only, or the filesystem must not be mounted. When the stream generated from a filesystem or volume is received, the default snapshot name will be .Qq --head-- . .Bl -tag -width "-D" .It Fl D , -dedup Deduplicated send is no longer supported. This flag is accepted for backwards compatibility, but a regular, non-deduplicated stream will be generated. .It Fl L , -large-block Generate a stream which may contain blocks larger than 128 KiB. This flag has no effect if the .Sy large_blocks pool feature is disabled, or if the .Sy recordsize property of this filesystem has never been set above 128 KiB. The receiving system must have the .Sy large_blocks pool feature enabled as well. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy large_blocks feature. .It Fl P , -parsable Print machine-parsable verbose information about the stream package generated. 
.It Fl c , -compressed Generate a more compact stream by using compressed WRITE records for blocks which are compressed on disk and in memory .Po see the .Sy compression property for details .Pc . If the .Sy lz4_compress or .Sy zstd_compress features are active on the sending system, then the receiving system must have the corresponding features enabled as well. If the .Sy large_blocks feature is enabled on the sending system but the .Fl L option is not supplied in conjunction with .Fl c , then the data will be decompressed before sending so it can be split into smaller block sizes. .It Fl w , -raw For encrypted datasets, send data exactly as it exists on disk. This allows backups to be taken even if encryption keys are not currently loaded. The backup may then be received on an untrusted machine since that machine will not have the encryption keys to read the protected data or alter it without being detected. Upon being received, the dataset will have the same encryption keys as it did on the send side, although the .Sy keylocation property will be defaulted to .Sy prompt if not otherwise provided. For unencrypted datasets, this flag will be equivalent to .Fl Lec . Note that if you do not use this flag for sending encrypted datasets, data will be sent unencrypted and may be re-encrypted with a different encryption key on the receiving system, which will disable the ability to do a raw send to that system for incrementals. .It Fl e , -embed Generate a more compact stream by using .Sy WRITE_EMBEDDED records for blocks which are stored more compactly on disk by the .Sy embedded_data pool feature. This flag has no effect if the .Sy embedded_data feature is disabled. The receiving system must have the .Sy embedded_data feature enabled. If the .Sy lz4_compress or .Sy zstd_compress features are active on the sending system, then the receiving system must have the corresponding features enabled as well. Datasets that are sent with this flag may not be received as an encrypted dataset, since encrypted datasets cannot use the .Sy embedded_data feature. See .Xr zpool-features 7 for details on ZFS feature flags and the .Sy embedded_data feature. .It Fl i Ar snapshot Ns | Ns Ar bookmark Generate an incremental send stream. The incremental source must be an earlier snapshot in the destination's history. It will commonly be an earlier snapshot in the destination's file system, in which case it can be specified as the last component of the name .Po the .Sy # or .Sy @ character and following .Pc . .Pp If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc. .It Fl n , -dryrun Do a dry-run .Pq Qq No-op send. Do not generate any actual send data. This is useful in conjunction with the .Fl v or .Fl P flags to determine what data will be sent. In this case, the verbose output will be written to standard output .Po contrast with a non-dry-run, where the stream is written to standard output and the verbose output goes to standard error .Pc . .It Fl v , -verbose Print verbose information about the stream package generated. This information includes a per-second report of how much data has been sent. The same report can be requested by sending .Dv SIGINFO or .Dv SIGUSR1 , regardless of .Fl v . .El .It Xo .Nm zfs .Cm send .Fl -redact Ar redaction_bookmark .Op Fl DLPVcenpv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Ar snapshot .Xc Generate a redacted send stream. 
This send stream contains all blocks from the snapshot being sent that aren't included in the redaction list contained in the bookmark specified by the .Fl -redact (or .Fl d ) flag. The resulting send stream is said to be redacted with respect to the snapshots the bookmark specified by the .Fl -redact No flag was created with . The bookmark must have been created by running .Nm zfs Cm redact on the snapshot being sent. .Pp This feature can be used to allow clones of a filesystem to be made available on a remote system, in the case where their parent need not (or needs to not) be usable. For example, if a filesystem contains sensitive data, and it has clones where that sensitive data has been secured or replaced with dummy data, redacted sends can be used to replicate the secured data without replicating the original sensitive data, while still sharing all possible blocks. A snapshot that has been redacted with respect to a set of snapshots will contain all blocks referenced by at least one snapshot in the set, but will contain none of the blocks referenced by none of the snapshots in the set. In other words, if all snapshots in the set have modified a given block in the parent, that block will not be sent; but if one or more snapshots have not modified a block in the parent, they will still reference the parent's block, so that block will be sent. Note that only user data will be redacted. .Pp When the redacted send stream is received, we will generate a redacted snapshot. Due to the nature of redaction, a redacted dataset can only be used in the following ways: .Bl -enum -width "a." .It To receive, as a clone, an incremental send from the original snapshot to one of the snapshots it was redacted with respect to. In this case, the stream will produce a valid dataset when received because all blocks that were redacted in the parent are guaranteed to be present in the child's send stream. This use case will produce a normal snapshot, which can be used just like other snapshots. . .It To receive an incremental send from the original snapshot to something redacted with respect to a subset of the set of snapshots the initial snapshot was redacted with respect to. In this case, each block that was redacted in the original is still redacted (redacting with respect to additional snapshots causes less data to be redacted (because the snapshots define what is permitted, and everything else is redacted)). This use case will produce a new redacted snapshot. .It To receive an incremental send from a redaction bookmark of the original snapshot that was created when redacting with respect to a subset of the set of snapshots the initial snapshot was created with respect to, to anything else. A send stream from such a redaction bookmark will contain all of the blocks necessary to fill in any redacted data, should it be needed, because the sending system is aware of what blocks were originally redacted. This will either produce a normal snapshot or a redacted one, depending on whether the new send stream is redacted. .It To receive an incremental send from a redacted version of the initial snapshot that is redacted with respect to a subset of the set of snapshots the initial snapshot was created with respect to. A send stream from a compatible redacted dataset will contain all of the blocks necessary to fill in any redacted data. This will either produce a normal snapshot or a redacted one, depending on whether the new send stream is redacted. .It To receive a full send as a clone of the redacted snapshot.
Since the stream is a full send, it definitionally contains all the data needed to create a new dataset. This use case will either produce a normal snapshot or a redacted one, depending on whether the full send stream was redacted. .El .Pp These restrictions are detected and enforced by .Nm zfs Cm receive ; a redacted send stream will contain the list of snapshots that the stream is redacted with respect to. These are stored with the redacted snapshot, and are used to detect and correctly handle the cases above. Note that for technical reasons, raw sends and redacted sends cannot be combined at this time. .It Xo .Nm zfs .Cm send .Op Fl PVenv .Fl t .Ar receive_resume_token .Xc Creates a send stream which resumes an interrupted receive. The .Ar receive_resume_token is the value of this property on the filesystem or volume that was being received into. See the documentation for .Nm zfs Cm receive Fl s for more details. .It Xo .Nm zfs .Cm send .Op Fl PVnv .Op Fl i Ar snapshot Ns | Ns Ar bookmark .Fl S .Ar filesystem .Xc Generate a send stream from a dataset that has been partially received. .Bl -tag -width "-L" .It Fl S , -saved This flag requires that the specified filesystem previously received a resumable send that did not finish and was interrupted. In such scenarios this flag enables the user to send this partially received state. Using this flag will always use the last fully received snapshot as the incremental source if it exists. .El .It Xo .Nm zfs .Cm redact .Ar snapshot redaction_bookmark .Ar redaction_snapshot Ns … .Xc Generate a new redaction bookmark. In addition to the typical bookmark information, a redaction bookmark contains the list of redacted blocks and the list of redaction snapshots specified. The redacted blocks are blocks in the snapshot which are not referenced by any of the redaction snapshots. These blocks are found by iterating over the metadata in each redaction snapshot to determine what has been changed since the target snapshot. Redaction is designed to support redacted zfs sends; see the entry for .Nm zfs Cm send for more information on the purpose of this operation. If a redact operation fails partway through (due to an error or a system failure), the redaction can be resumed by rerunning the same command. .El .Ss Redaction ZFS has support for a limited version of data subsetting, in the form of redaction. Using the .Nm zfs Cm redact command, a .Sy redaction bookmark can be created that stores a list of blocks containing sensitive information. When provided to .Nm zfs Cm send , this causes a .Sy redacted send to occur. Redacted sends omit the blocks containing sensitive information, replacing them with REDACT records. When these send streams are received, a .Sy redacted dataset is created. A redacted dataset cannot be mounted by default, since it is incomplete. It can be used to receive other send streams. In this way datasets can be used for data backup and replication, with all the benefits that zfs send and receive have to offer, while protecting sensitive information from being stored on less-trusted machines or services. .Pp For the purposes of redaction, there are two steps to the process. A redact step, and a send/receive step. First, a redaction bookmark is created. This is done by providing the .Nm zfs Cm redact command with a parent snapshot, a bookmark to be created, and a number of redaction snapshots. These redaction snapshots must be descendants of the parent snapshot, and they should modify data that is considered sensitive in some way. 
Any blocks of data modified by all of the redaction snapshots will be listed in the redaction bookmark, because it represents the truly sensitive information. When it comes to the send step, the send process will not send the blocks listed in the redaction bookmark, instead replacing them with REDACT records. When received on the target system, this will create a redacted dataset, missing the data that corresponds to the blocks in the redaction bookmark on the sending system. The incremental send streams from the original parent to the redaction snapshots can then also be received on the target system, and this will produce a complete snapshot that can be used normally. Incrementals from one snapshot on the parent filesystem and another can also be done by sending from the redaction bookmark, rather than the snapshots themselves. .Pp In order to make the purpose of the feature more clear, an example is provided. Consider a zfs filesystem containing four files. These files represent information for an online shopping service. One file contains a list of usernames and passwords, another contains purchase histories, a third contains click tracking data, and a fourth contains user preferences. The owner of this data wants to make it available for their development teams to test against, and their market research teams to do analysis on. The development teams need information about user preferences and the click tracking data, while the market research teams need information about purchase histories and user preferences. Neither needs access to the usernames and passwords. However, because all of this data is stored in one ZFS filesystem, it must all be sent and received together. In addition, the owner of the data wants to take advantage of features like compression, checksumming, and snapshots, so they do want to continue to use ZFS to store and transmit their data. Redaction can help them do so. First, they would make two clones of a snapshot of the data on the source. In one clone, they create the setup they want their market research team to see; they delete the usernames and passwords file, and overwrite the click tracking data with dummy information. In another, they create the setup they want the development teams to see, by replacing the passwords with fake information and replacing the purchase histories with randomly generated ones. They would then create a redaction bookmark on the parent snapshot, using snapshots on the two clones as redaction snapshots. The parent can then be sent, redacted, to the target server where the research and development teams have access. Finally, incremental sends from the parent snapshot to each of the clones can be sent to and received on the target server; these snapshots are identical to the ones on the source, and are ready to be used, while the parent snapshot on the target contains none of the username and password data present on the source, because it was removed by the redacted send operation. . .Sh SIGNALS See .Fl v . . .Sh EXAMPLES .\" These are, respectively, examples 12, 13 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Remotely Replicating ZFS Data The following commands send a full stream and then an incremental stream to a remote machine, restoring them into .Em poolB/received/fs@a and .Em poolB/received/fs@b , respectively. .Em poolB must contain the file system .Em poolB/received , and must not initially contain .Em poolB/received/fs . 
.Bd -literal -compact -offset Ds .No # Nm zfs Cm send Ar pool/fs@a | .No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs Ns @ Ns Ar a .No # Nm zfs Cm send Fl i Ar a pool/fs@b | .No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs .Ed . .Ss Example 2 : No Using the Nm zfs Cm receive Fl d No Option The following command sends a full stream of .Ar poolA/fsA/fsB@snap to a remote machine, receiving it into .Ar poolB/received/fsA/fsB@snap . The .Ar fsA/fsB@snap portion of the received snapshot's name is determined from the name of the sent snapshot. .Ar poolB must contain the file system .Ar poolB/received . If .Ar poolB/received/fsA does not exist, it is created as an empty file system. .Bd -literal -compact -offset Ds .No # Nm zfs Cm send Ar poolA/fsA/fsB@snap | .No " " Nm ssh Ar host Nm zfs Cm receive Fl d Ar poolB/received .Ed . .Sh SEE ALSO .Xr zfs-bookmark 8 , .Xr zfs-receive 8 , .Xr zfs-redact 8 , .Xr zfs-snapshot 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-set.8 b/sys/contrib/openzfs/man/man8/zfs-set.8 index 67f4d6eba171..08daf09d05f8 100644 --- a/sys/contrib/openzfs/man/man8/zfs-set.8 +++ b/sys/contrib/openzfs/man/man8/zfs-set.8 @@ -1,377 +1,377 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd April 20, 2024 +.Dd October 12, 2024 .Dt ZFS-SET 8 .Os . .Sh NAME .Nm zfs-set .Nd set properties on ZFS datasets .Sh SYNOPSIS .Nm zfs .Cm set .Op Fl u .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns … .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … .Nm zfs .Cm get .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl j Op Ar --json-int .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc .Oo Fl s Ar source Ns Oo , Ns Ar source Oc Ns … Oc .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc .Cm all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … .Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns … .Nm zfs .Cm inherit .Op Fl rS .Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … . 
.Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm set .Op Fl u .Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns … .Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … .Xc Only some properties can be edited. See .Xr zfsprops 7 for more information on what properties can be set and acceptable values. Numeric values can be specified as exact values, or in a human-readable form with a suffix of .Sy B , K , M , G , T , P , E , Z .Po for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, or zettabytes, respectively .Pc . User properties can be set on snapshots. For more information, see the .Em User Properties section of .Xr zfsprops 7 . .Bl -tag -width "-u" .It Fl u Update mountpoint, sharenfs, sharesmb property but do not mount or share the dataset. .El .It Xo .Nm zfs .Cm get .Op Fl r Ns | Ns Fl d Ar depth .Op Fl Hp .Op Fl j Op Ar --json-int .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc .Oo Fl s Ar source Ns Oo , Ns Ar source Oc Ns … Oc .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc .Cm all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … .Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns … .Xc Displays properties for the given datasets. If no datasets are specified, then the command displays properties for all datasets on the system. For each property, the following columns are displayed: .Bl -tag -compact -offset 4n -width "property" .It Sy name Dataset name .It Sy property Property name .It Sy value Property value .It Sy source Property source .Sy local , default , inherited , temporary , received , No or Sy - Pq none . .El .Pp All columns are displayed by default, though this can be controlled by using the .Fl o option. This command takes a comma-separated list of properties as described in the .Sx Native Properties and .Sx User Properties sections of .Xr zfsprops 7 . .Pp The value .Sy all can be used to display all properties that apply to the given dataset's type .Pq Sy filesystem , volume , snapshot , No or Sy bookmark . .Bl -tag -width "-s source" .It Fl j , -json Op Ar --json-int Display the output in JSON format. Specify .Sy --json-int to display numbers in integer format instead of strings for JSON output. .It Fl H Display output in a form more easily parsed by scripts. Any headers are omitted, and fields are explicitly separated by a single tab instead of an arbitrary amount of space. .It Fl d Ar depth Recursively display any children of the dataset, limiting the recursion to .Ar depth . A depth of .Sy 1 will display only the dataset and its direct children. .It Fl o Ar field A comma-separated list of columns to display, defaults to .Sy name , Ns Sy property , Ns Sy value , Ns Sy source . .It Fl p Display numbers in parsable .Pq exact values. .It Fl r Recursively display properties for any children. .It Fl s Ar source A comma-separated list of sources to display. Those properties coming from a source other than those in this list are ignored. Each source must be one of the following: .Sy local , default , inherited , temporary , received , No or Sy none . The default value is all sources. .It Fl t Ar type A comma-separated list of types to display, where .Ar type is one of .Sy filesystem , snapshot , volume , bookmark , No or Sy all . .Sy fs , .Sy snap , or .Sy vol can be used as aliases for .Sy filesystem , .Sy snapshot , or .Sy volume . 
.El .It Xo .Nm zfs .Cm inherit .Op Fl rS .Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns … .Xc Clears the specified property, causing it to be inherited from an ancestor, restored to default if no ancestor has the property set, or with the .Fl S option reverted to the received value if one exists. See .Xr zfsprops 7 for a listing of default values, and details on which properties can be inherited. .Bl -tag -width "-r" .It Fl r Recursively inherit the given property for all children. .It Fl S Revert the property to the received value, if one exists; otherwise, for non-inheritable properties, to the default; otherwise, operate as if the .Fl S option was not specified. .El .El . .Sh EXAMPLES .\" These are, respectively, examples 1, 4, 6, 7, 11, 14, 16 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Creating a ZFS File System Hierarchy The following commands create a file system named .Ar pool/home and a file system named .Ar pool/home/bob . The mount point .Pa /export/home is set for the parent file system, and is automatically inherited by the child file system. .Dl # Nm zfs Cm create Ar pool/home .Dl # Nm zfs Cm set Sy mountpoint Ns = Ns Ar /export/home pool/home .Dl # Nm zfs Cm create Ar pool/home/bob . .Ss Example 2 : No Disabling and Enabling File System Compression The following command disables the .Sy compression property for all file systems under .Ar pool/home . The next command explicitly enables .Sy compression for .Ar pool/home/anne . .Dl # Nm zfs Cm set Sy compression Ns = Ns Sy off Ar pool/home .Dl # Nm zfs Cm set Sy compression Ns = Ns Sy on Ar pool/home/anne . .Ss Example 3 : No Setting a Quota on a ZFS File System The following command sets a quota of 50 Gbytes for .Ar pool/home/bob : .Dl # Nm zfs Cm set Sy quota Ns = Ns Ar 50G pool/home/bob . 
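.Pp For instance, the quota set above could then be read back in script-friendly form (this is an illustrative sketch reusing the dataset from the example, not an additional upstream example): .Dl # Nm zfs Cm get Fl H o Sy value quota Ar pool/home/bob .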
.Ss Example 4 : No Listing ZFS Properties The following command lists all properties for .Ar pool/home/bob : .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Sy all Ar pool/home/bob NAME PROPERTY VALUE SOURCE pool/home/bob type filesystem - pool/home/bob creation Tue Jul 21 15:53 2009 - pool/home/bob used 21K - pool/home/bob available 20.0G - pool/home/bob referenced 21K - pool/home/bob compressratio 1.00x - pool/home/bob mounted yes - pool/home/bob quota 20G local pool/home/bob reservation none default pool/home/bob recordsize 128K default pool/home/bob mountpoint /pool/home/bob default pool/home/bob sharenfs off default pool/home/bob checksum on default pool/home/bob compression on local pool/home/bob atime on default pool/home/bob devices on default pool/home/bob exec on default pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob zoned off default pool/home/bob snapdir hidden default pool/home/bob acltype off default pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob xattr on default pool/home/bob copies 1 default pool/home/bob version 4 - pool/home/bob utf8only off - pool/home/bob normalization none - pool/home/bob casesensitivity sensitive - pool/home/bob vscan off default pool/home/bob nbmand off default pool/home/bob sharesmb off default pool/home/bob refquota none default pool/home/bob refreservation none default pool/home/bob primarycache all default pool/home/bob secondarycache all default pool/home/bob usedbysnapshots 0 - pool/home/bob usedbydataset 21K - pool/home/bob usedbychildren 0 - pool/home/bob usedbyrefreservation 0 - .Ed .Pp The following command gets a single property value: .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Fl H o Sy value compression Ar pool/home/bob on .Ed .Pp The following command gets a single property value recursively in JSON format: .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Fl j Fl r Sy mountpoint Ar pool/home | Nm jq { "output_version": { "command": "zfs get", "vers_major": 0, "vers_minor": 1 }, "datasets": { "pool/home": { "name": "pool/home", "type": "FILESYSTEM", "pool": "pool", "createtxg": "10", "properties": { "mountpoint": { "value": "/pool/home", "source": { "type": "DEFAULT", "data": "-" } } } }, "pool/home/bob": { "name": "pool/home/bob", "type": "FILESYSTEM", "pool": "pool", "createtxg": "1176", "properties": { "mountpoint": { "value": "/pool/home/bob", "source": { "type": "DEFAULT", "data": "-" } } } } } } .Ed .Pp The following command lists all properties with local settings for .Ar pool/home/bob : .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Fl r s Sy local Fl o Sy name , Ns Sy property , Ns Sy value all Ar pool/home/bob NAME PROPERTY VALUE pool/home/bob quota 20G pool/home/bob compression on .Ed . .Ss Example 5 : No Inheriting ZFS Properties The following command causes .Ar pool/home/bob No and Ar pool/home/anne to inherit the .Sy checksum property from their parent. .Dl # Nm zfs Cm inherit Sy checksum Ar pool/home/bob pool/home/anne . .Ss Example 6 : No Setting User Properties The following example sets the user-defined .Ar com.example : Ns Ar department property for a dataset: .Dl # Nm zfs Cm set Ar com.example : Ns Ar department Ns = Ns Ar 12345 tank/accounting . 
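.Pp For instance, the user property set above could later be inspected, or cleared entirely with .Nm zfs Cm inherit (user properties are removed when no ancestor has them set); this is an illustrative sketch reusing the dataset from the example: .Dl # Nm zfs Cm get Ar com.example : Ns Ar department tank/accounting .Dl # Nm zfs Cm inherit Ar com.example : Ns Ar department tank/accounting .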
.Ss Example 7 : No Setting sharenfs Property Options on a ZFS File System The following commands show how to set .Sy sharenfs property options to enable read-write access for a set of IP addresses and to enable root access for system .Qq neo on the .Ar tank/home file system: .Dl # Nm zfs Cm set Sy sharenfs Ns = Ns ' Ns Ar rw Ns =@123.123.0.0/16:[::1],root= Ns Ar neo Ns ' tank/home .Pp If you are using DNS for host name resolution, specify the fully-qualified hostname. . .Sh SEE ALSO .Xr zfsprops 7 , .Xr zfs-list 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-share.8 b/sys/contrib/openzfs/man/man8/zfs-share.8 index f7a09a189182..e9c32a44b0c7 100644 --- a/sys/contrib/openzfs/man/man8/zfs-share.8 +++ b/sys/contrib/openzfs/man/man8/zfs-share.8 @@ -1,101 +1,101 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd May 17, 2021 +.Dd July 11, 2022 .Dt ZFS-SHARE 8 .Os . .Sh NAME .Nm zfs-share .Nd share and unshare ZFS filesystems .Sh SYNOPSIS .Nm zfs .Cm share .Op Fl l .Fl a Ns | Ns Ar filesystem .Nm zfs .Cm unshare .Fl a Ns | Ns Ar filesystem Ns | Ns Ar mountpoint . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm share .Op Fl l .Fl a Ns | Ns Ar filesystem .Xc Shares available ZFS file systems. .Bl -tag -width "-a" .It Fl l Load keys for encrypted filesystems as they are being mounted. This is equivalent to executing .Nm zfs Cm load-key on each encryption root before mounting it. Note that if a filesystem has .Sy keylocation Ns = Ns Sy prompt , this will cause the terminal to interactively block after asking for the key. .It Fl a Share all available ZFS file systems. Invoked automatically as part of the boot process. .It Ar filesystem Share the specified filesystem according to the .Sy sharenfs and .Sy sharesmb properties. File systems are shared when the .Sy sharenfs or .Sy sharesmb property is set. .El .It Xo .Nm zfs .Cm unshare .Fl a Ns | Ns Ar filesystem Ns | Ns Ar mountpoint .Xc Unshares currently shared ZFS file systems. .Bl -tag -width "-a" .It Fl a Unshare all available ZFS file systems. Invoked automatically as part of the shutdown process. .It Ar filesystem Ns | Ns Ar mountpoint Unshare the specified filesystem. 
The command can also be given a path to a ZFS file system shared on the system. .El .El . .Sh SEE ALSO .Xr exports 5 , .Xr smb.conf 5 , .Xr zfsprops 7 diff --git a/sys/contrib/openzfs/man/man8/zfs-snapshot.8 b/sys/contrib/openzfs/man/man8/zfs-snapshot.8 index 3ddd1273c8e8..8f4b2c335f09 100644 --- a/sys/contrib/openzfs/man/man8/zfs-snapshot.8 +++ b/sys/contrib/openzfs/man/man8/zfs-snapshot.8 @@ -1,143 +1,143 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZFS-SNAPSHOT 8 .Os . .Sh NAME .Nm zfs-snapshot .Nd create snapshots of ZFS datasets .Sh SYNOPSIS .Nm zfs .Cm snapshot .Op Fl r .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Ar dataset Ns @ Ns Ar snapname Ns … . .Sh DESCRIPTION Creates a snapshot of a dataset or multiple snapshots of different datasets. .Pp Snapshots are created atomically. That is, a snapshot is a consistent image of a dataset at a specific point in time; it includes all modifications to the dataset made by system calls that have successfully completed before that point in time. Recursive snapshots created through the .Fl r option are all created at the same time. .Pp .Nm zfs Cm snap can be used as an alias for .Nm zfs Cm snapshot . .Pp See the .Sx Snapshots section of .Xr zfsconcepts 7 for details. .Bl -tag -width "-o" .It Fl o Ar property Ns = Ns Ar value Set the specified property; see .Nm zfs Cm create for details. .It Fl r Recursively create snapshots of all descendent datasets .El . .Sh EXAMPLES .\" These are, respectively, examples 2, 3, 10, 15 from zfs.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Creating a ZFS Snapshot The following command creates a snapshot named .Ar yesterday . This snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of the .Ar pool/home/bob file system. .Dl # Nm zfs Cm snapshot Ar pool/home/bob Ns @ Ns Ar yesterday . .Ss Example 2 : No Creating and Destroying Multiple Snapshots The following command creates snapshots named .Ar yesterday No of Ar pool/home and all of its descendent file systems. Each snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of its file system. 
The second command destroys the newly created snapshots. .Dl # Nm zfs Cm snapshot Fl r Ar pool/home Ns @ Ns Ar yesterday .Dl # Nm zfs Cm destroy Fl r Ar pool/home Ns @ Ns Ar yesterday . .Ss Example 3 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -compact -offset Ds .No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data .No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today .No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them .No # Nm zfs Cm promote Ar pool/project/beta .No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy .No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed .No # Nm zfs Cm destroy Ar pool/project/legacy .Ed . .Ss Example 4 : No Performing a Rolling Snapshot The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .Bd -literal -compact -offset Ds .No # Nm zfs Cm destroy Fl r Ar pool/users@7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@6daysago No @ Ns Ar 7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@5daysago No @ Ns Ar 6daysago .No # Nm zfs Cm rename Fl r Ar pool/users@4daysago No @ Ns Ar 5daysago .No # Nm zfs Cm rename Fl r Ar pool/users@3daysago No @ Ns Ar 4daysago .No # Nm zfs Cm rename Fl r Ar pool/users@2daysago No @ Ns Ar 3daysago .No # Nm zfs Cm rename Fl r Ar pool/users@yesterday No @ Ns Ar 2daysago .No # Nm zfs Cm rename Fl r Ar pool/users@today No @ Ns Ar yesterday .No # Nm zfs Cm snapshot Fl r Ar pool/users Ns @ Ns Ar today .Ed . .Sh SEE ALSO .Xr zfs-bookmark 8 , .Xr zfs-clone 8 , .Xr zfs-destroy 8 , .Xr zfs-diff 8 , .Xr zfs-hold 8 , .Xr zfs-rename 8 , .Xr zfs-rollback 8 , .Xr zfs-send 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-upgrade.8 b/sys/contrib/openzfs/man/man8/zfs-upgrade.8 index bac74e37aef9..a5ce2b760da4 100644 --- a/sys/contrib/openzfs/man/man8/zfs-upgrade.8 +++ b/sys/contrib/openzfs/man/man8/zfs-upgrade.8 @@ -1,104 +1,104 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. 
All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd June 30, 2019 +.Dd July 11, 2022 .Dt ZFS-UPGRADE 8 .Os . .Sh NAME .Nm zfs-upgrade .Nd manage on-disk version of ZFS filesystems .Sh SYNOPSIS .Nm zfs .Cm upgrade .Nm zfs .Cm upgrade .Fl v .Nm zfs .Cm upgrade .Op Fl r .Op Fl V Ar version .Fl a Ns | Ns Ar filesystem . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm upgrade .Xc Displays a list of file systems that are not the most recent version. .It Xo .Nm zfs .Cm upgrade .Fl v .Xc Displays a list of currently supported file system versions. .It Xo .Nm zfs .Cm upgrade .Op Fl r .Op Fl V Ar version .Fl a Ns | Ns Ar filesystem .Xc Upgrades file systems to a new on-disk version. Once this is done, the file systems will no longer be accessible on systems running older versions of ZFS. .Nm zfs Cm send streams generated from new snapshots of these file systems cannot be accessed on systems running older versions of ZFS. .Pp In general, the file system version is independent of the pool version. See .Xr zpool-features 7 for information on features of ZFS storage pools. .Pp In some cases, the file system version and the pool version are interrelated and the pool version must be upgraded before the file system version can be upgraded. .Bl -tag -width "filesystem" .It Fl V Ar version Upgrade to .Ar version . If not specified, upgrade to the most recent version. This option can only be used to increase the version number, and only up to the most recent version supported by this version of ZFS. .It Fl a Upgrade all file systems on all imported pools. .It Ar filesystem Upgrade the specified file system. .It Fl r Upgrade the specified file system and all descendent file systems. .El .El .Sh SEE ALSO .Xr zpool-upgrade 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-userspace.8 b/sys/contrib/openzfs/man/man8/zfs-userspace.8 index d7a4d18e83b1..c255d911740d 100644 --- a/sys/contrib/openzfs/man/man8/zfs-userspace.8 +++ b/sys/contrib/openzfs/man/man8/zfs-userspace.8 @@ -1,189 +1,189 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright 2019 Richard Laager. All rights reserved. 
.\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd June 30, 2019 +.Dd July 11, 2022 .Dt ZFS-USERSPACE 8 .Os . .Sh NAME .Nm zfs-userspace .Nd display space and quotas of ZFS dataset .Sh SYNOPSIS .Nm zfs .Cm userspace .Op Fl Hinp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc .Oo Fl s Ar field Oc Ns … .Oo Fl S Ar field Oc Ns … .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc .Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path .Nm zfs .Cm groupspace .Op Fl Hinp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc .Oo Fl s Ar field Oc Ns … .Oo Fl S Ar field Oc Ns … .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc .Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path .Nm zfs .Cm projectspace .Op Fl Hp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc .Oo Fl s Ar field Oc Ns … .Oo Fl S Ar field Oc Ns … .Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm userspace .Op Fl Hinp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc .Oo Fl s Ar field Oc Ns … .Oo Fl S Ar field Oc Ns … .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc .Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path .Xc Displays space consumed by, and quotas on, each user in the specified filesystem, snapshot, or path. If a path is given, the filesystem that contains that path will be used. This corresponds to the .Sy userused@ Ns Em user , .Sy userobjused@ Ns Em user , .Sy userquota@ Ns Em user , and .Sy userobjquota@ Ns Em user properties. .Bl -tag -width "-S field" .It Fl H Do not print headers, use tab-delimited output. .It Fl S Ar field Sort by this field in reverse order. See .Fl s . .It Fl i Translate SID to POSIX ID. The POSIX ID may be ephemeral if no mapping exists. Normal POSIX interfaces .Pq like Xr stat 2 , Nm ls Fl l perform this translation, so the .Fl i option allows the output from .Nm zfs Cm userspace to be compared directly with those utilities. However, .Fl i may lead to confusion if some files were created by an SMB user before a SMB-to-POSIX name mapping was established. In such a case, some files will be owned by the SMB entity and some by the POSIX entity. However, the .Fl i option will report that the POSIX entity has the total usage and quota for both. .It Fl n Print numeric ID instead of user/group name. .It Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Display only the specified fields from the following set: .Sy type , .Sy name , .Sy used , .Sy quota . The default is to display all fields. .It Fl p Use exact .Pq parsable numeric output. .It Fl s Ar field Sort output by this field. The .Fl s and .Fl S flags may be specified multiple times to sort first by one field, then by another. The default is .Fl s Sy type Fl s Sy name . .It Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Print only the specified types from the following set: .Sy all , .Sy posixuser , .Sy smbuser , .Sy posixgroup , .Sy smbgroup . The default is .Fl t Sy posixuser , Ns Sy smbuser . The default can be changed to include group types. .El .It Xo .Nm zfs .Cm groupspace .Op Fl Hinp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc .Oo Fl s Ar field Oc Ns … .Oo Fl S Ar field Oc Ns … .Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns … Oc .Ar filesystem Ns | Ns Ar snapshot .Xc Displays space consumed by, and quotas on, each group in the specified filesystem or snapshot. This subcommand is identical to .Cm userspace , except that the default types to display are .Fl t Sy posixgroup , Ns Sy smbgroup . 
.It Xo .Nm zfs .Cm projectspace .Op Fl Hp .Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns … Oc .Oo Fl s Ar field Oc Ns … .Oo Fl S Ar field Oc Ns … .Ar filesystem Ns | Ns Ar snapshot Ns | Ns Ar path .Xc Displays space consumed by, and quotas on, each project in the specified filesystem or snapshot. This subcommand is identical to .Cm userspace , except that the project identifier is a numeral, not a name. It therefore needs neither the .Fl i option for SID to POSIX ID translation, nor .Fl n for numeric IDs, nor .Fl t for types. .El . .Sh SEE ALSO .Xr zfsprops 7 , .Xr zfs-set 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-wait.8 b/sys/contrib/openzfs/man/man8/zfs-wait.8 index 554a67455c60..e5c60010d2f9 100644 --- a/sys/contrib/openzfs/man/man8/zfs-wait.8 +++ b/sys/contrib/openzfs/man/man8/zfs-wait.8 @@ -1,66 +1,66 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 31, 2021 +.Dd July 11, 2022 .Dt ZFS-WAIT 8 .Os . .Sh NAME .Nm zfs-wait .Nd wait for activity in a ZFS filesystem to stop .Sh SYNOPSIS .Nm zfs .Cm wait .Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns … .Ar filesystem . .Sh DESCRIPTION Waits until all background activity of the given types has ceased in the given filesystem. The activity could cease because it has completed or because the filesystem has been destroyed or unmounted. If no activities are specified, the command waits until background activity of every type listed below has ceased. If there is no activity of the given types in progress, the command returns immediately. .Pp These are the possible values for .Ar activity , along with what each one waits for: .Bl -tag -compact -offset Ds -width "deleteq" .It Sy deleteq The filesystem's internal delete queue to empty .El .Pp Note that the internal delete queue does not finish draining until all large files have had time to be fully destroyed and all open file handles to unlinked files are closed. .
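.Pp For instance, assuming a file system named .Ar tank/home from which large files have just been removed, the following would block until its delete queue has drained (an illustrative sketch; the dataset name is hypothetical): .Dl # Nm zfs Cm wait Fl t Sy deleteq Ar tank/home .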
.Sh SEE ALSO .Xr lsof 8 diff --git a/sys/contrib/openzfs/man/man8/zfs-zone.8 b/sys/contrib/openzfs/man/man8/zfs-zone.8 index 7ad0ac89463c..a56a304e82b2 100644 --- a/sys/contrib/openzfs/man/man8/zfs-zone.8 +++ b/sys/contrib/openzfs/man/man8/zfs-zone.8 @@ -1,117 +1,117 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2011, Pawel Jakub Dawidek .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, Bryan Drewery .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright (c) 2014, Xin LI .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" Copyright 2021 Klara, Inc. .\" -.Dd June 3, 2022 +.Dd July 11, 2022 .Dt ZFS-ZONE 8 .Os . .Sh NAME .Nm zfs-zone , .Nm zfs-unzone .Nd attach and detach ZFS filesystems to user namespaces .Sh SYNOPSIS .Nm zfs Cm zone .Ar nsfile .Ar filesystem .Nm zfs Cm unzone .Ar nsfile .Ar filesystem . .Sh DESCRIPTION .Bl -tag -width "" .It Xo .Nm zfs .Cm zone .Ar nsfile .Ar filesystem .Xc Attach the specified .Ar filesystem to the user namespace identified by .Ar nsfile . From now on this file system tree can be managed from within a user namespace if the .Sy zoned property has been set. .Pp You cannot attach a zoned dataset's children to another user namespace. You also cannot attach the root file system of the user namespace or any dataset which needs to be mounted before the zfs service is run inside the user namespace, as it would be attached unmounted until it is mounted from the service inside the user namespace. .Pp To allow management of the dataset from within a user namespace, the .Sy zoned property has to be set and the user namespace needs access to the .Pa /dev/zfs device. The .Sy quota property cannot be changed from within a user namespace. .Pp After a dataset is attached to a user namespace and the .Sy zoned property is set, a zoned file system cannot be mounted outside the user namespace, since the user namespace administrator might have set the mount point to an unacceptable value. .It Xo .Nm zfs .Cm unzone .Ar nsfile .Ar filesystem .Xc Detach the specified .Ar filesystem from the user namespace identified by .Ar nsfile .
.El .Sh EXAMPLES .Ss Example 1 : No Delegating a Dataset to a User Namespace The following example delegates the .Ar tank/users dataset to a user namespace identified by user namespace file .Pa /proc/1234/ns/user . .Dl # Nm zfs Cm zone Ar /proc/1234/ns/user Ar tank/users . .Sh SEE ALSO .Xr zfsprops 7 diff --git a/sys/contrib/openzfs/man/man8/zfs.8 b/sys/contrib/openzfs/man/man8/zfs.8 index e16a3a82b672..b7566a727469 100644 --- a/sys/contrib/openzfs/man/man8/zfs.8 +++ b/sys/contrib/openzfs/man/man8/zfs.8 @@ -1,845 +1,845 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011 Joshua M. Clulow .\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. .\" Copyright (c) 2011, Pawel Jakub Dawidek .\" Copyright (c) 2012, Glen Barber .\" Copyright (c) 2012, Bryan Drewery .\" Copyright (c) 2013, Steven Hartland .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. .\" Copyright (c) 2014, Joyent, Inc. All rights reserved. .\" Copyright (c) 2014 by Adam Stevko. All rights reserved. .\" Copyright (c) 2014 Integros [integros.com] .\" Copyright (c) 2014, Xin LI .\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. .\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. .\" Copyright 2019 Richard Laager. All rights reserved. .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd April 18, 2025 +.Dd May 12, 2025 .Dt ZFS 8 .Os . .Sh NAME .Nm zfs .Nd configure ZFS datasets .Sh SYNOPSIS .Nm .Fl ?V .Nm .Cm version .Op Fl j .Nm .Cm subcommand .Op Ar arguments . .Sh DESCRIPTION The .Nm command configures ZFS datasets within a ZFS storage pool, as described in .Xr zpool 8 . A dataset is identified by a unique path within the ZFS namespace: .Pp .D1 Ar pool Ns Oo Sy / Ns Ar component Oc Ns Sy / Ns Ar component .Pp for example: .Pp .Dl rpool/var/log .Pp The maximum length of a dataset name is .Sy ZFS_MAX_DATASET_NAME_LEN No - 1 ASCII characters (currently 255) satisfying .Sy [A-Za-z_.:/ -] . Additionally snapshots are allowed to contain a single .Sy @ character, while bookmarks are allowed to contain a single .Sy # character. .Sy / is used as separator between components. The maximum amount of nesting allowed in a path is .Sy zfs_max_dataset_nesting levels deep. ZFS tunables .Pq Sy zfs_* are explained in .Xr zfs 4 . .Pp A dataset can be one of the following: .Bl -tag -offset Ds -width "file system" .It Sy file system Can be mounted within the standard system namespace and behaves like other file systems. While ZFS file systems are designed to be POSIX-compliant, known issues exist that prevent compliance in some cases. 
Applications that depend on standards conformance might fail due to non-standard behavior when checking file system free space. .It Sy volume A logical volume exported as a raw or block device. This type of dataset should only be used when a block device is required. File systems are typically used in most environments. .It Sy snapshot A read-only version of a file system or volume at a given point in time. It is specified as .Ar filesystem Ns @ Ns Ar name or .Ar volume Ns @ Ns Ar name . .It Sy bookmark Much like a .Sy snapshot , but without the hold on on-disk data. It can be used as the source of a send (but not for a receive). It is specified as .Ar filesystem Ns # Ns Ar name or .Ar volume Ns # Ns Ar name . .El .Pp See .Xr zfsconcepts 7 for details. . .Ss Properties Properties are divided into two types: native properties and user-defined .Pq or Qq user properties. Native properties either export internal statistics or control ZFS behavior. In addition, native properties are either editable or read-only. User properties have no effect on ZFS behavior, but you can use them to annotate datasets in a way that is meaningful in your environment. For more information about properties, see .Xr zfsprops 7 . . .Ss Encryption Enabling the .Sy encryption feature allows for the creation of encrypted filesystems and volumes. ZFS will encrypt file and zvol data, file attributes, ACLs, permission bits, directory listings, FUID mappings, and .Sy userused Ns / Ns Sy groupused Ns / Ns Sy projectused data. For an overview of encryption, see .Xr zfs-load-key 8 . . .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Bl -tag -width "" .It Nm Fl ? Displays a help message. .It Xo .Nm .Fl V , -version .Xc .It Xo .Nm .Cm version .Op Fl j .Xc Displays the software version of the .Nm userland utility and the zfs kernel module. Use .Fl j option to output in JSON format. .El . .Ss Dataset Management .Bl -tag -width "" .It Xr zfs-list 8 Lists the property information for the given datasets in tabular form. .It Xr zfs-create 8 Creates a new ZFS file system or volume. .It Xr zfs-destroy 8 Destroys the given dataset(s), snapshot(s), or bookmark. .It Xr zfs-rename 8 Renames the given dataset (filesystem or snapshot). .It Xr zfs-upgrade 8 Manage upgrading the on-disk version of filesystems. .El . .Ss Snapshots .Bl -tag -width "" .It Xr zfs-snapshot 8 Creates snapshots with the given names. .It Xr zfs-rollback 8 Roll back the given dataset to a previous snapshot. .It Xr zfs-hold 8 Ns / Ns Xr zfs-release 8 Add or remove a hold reference to the specified snapshot or snapshots. If a hold exists on a snapshot, attempts to destroy that snapshot by using the .Nm zfs Cm destroy command return .Sy EBUSY . .It Xr zfs-diff 8 Display the difference between a snapshot of a given filesystem and another snapshot of that filesystem from a later time or the current contents of the filesystem. .El . .Ss Clones .Bl -tag -width "" .It Xr zfs-clone 8 Creates a clone of the given snapshot. .It Xr zfs-promote 8 Promotes a clone file system to no longer be dependent on its .Qq origin snapshot. .El . .Ss Send & Receive .Bl -tag -width "" .It Xr zfs-send 8 Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. .It Xr zfs-receive 8 Creates a snapshot whose contents are as specified in the stream provided on standard input. If a full stream is received, then a new file system is created as well. 
Streams are created using the .Xr zfs-send 8 subcommand, which by default creates a full stream. .It Xr zfs-bookmark 8 Creates a new bookmark of the given snapshot or bookmark. Bookmarks mark the point in time when the snapshot was created, and can be used as the incremental source for a .Nm zfs Cm send command. .It Xr zfs-redact 8 Generate a new redaction bookmark. This feature can be used to allow clones of a filesystem to be made available on a remote system, in the case where their parent need not (or needs to not) be usable. .El . .Ss Properties .Bl -tag -width "" .It Xr zfs-get 8 Displays properties for the given datasets. .It Xr zfs-set 8 Sets the property or list of properties to the given value(s) for each dataset. .It Xr zfs-inherit 8 Clears the specified property, causing it to be inherited from an ancestor, restored to default if no ancestor has the property set, or with the .Fl S option reverted to the received value if one exists. .El . .Ss Quotas .Bl -tag -width "" .It Xr zfs-userspace 8 Ns / Ns Xr zfs-groupspace 8 Ns / Ns Xr zfs-projectspace 8 Displays space consumed by, and quotas on, each user, group, or project in the specified filesystem or snapshot. .It Xr zfs-project 8 List, set, or clear project ID and/or inherit flag on the files or directories. .El . .Ss Mountpoints .Bl -tag -width "" .It Xr zfs-mount 8 Displays all ZFS file systems currently mounted, or mount ZFS filesystem on a path described by its .Sy mountpoint property. .It Xr zfs-unmount 8 Unmounts currently mounted ZFS file systems. .El . .Ss Shares .Bl -tag -width "" .It Xr zfs-share 8 Shares available ZFS file systems. .It Xr zfs-unshare 8 Unshares currently shared ZFS file systems. .El . .Ss Delegated Administration .Bl -tag -width "" .It Xr zfs-allow 8 Delegate permissions on the specified filesystem or volume. .It Xr zfs-unallow 8 Remove delegated permissions on the specified filesystem or volume. .El . .Ss Encryption .Bl -tag -width "" .It Xr zfs-change-key 8 Add or change an encryption key on the specified dataset. .It Xr zfs-load-key 8 Load the key for the specified encrypted dataset, enabling access. .It Xr zfs-unload-key 8 Unload a key for the specified dataset, removing the ability to access the dataset. .El . .Ss Channel Programs .Bl -tag -width "" .It Xr zfs-program 8 Execute ZFS administrative operations programmatically via a Lua script-language channel program. .El . .Ss Data rewrite .Bl -tag -width "" .It Xr zfs-rewrite 8 Rewrite specified files without modification. .El . .Ss Jails .Bl -tag -width "" .It Xr zfs-jail 8 Attaches a filesystem to a jail. .It Xr zfs-unjail 8 Detaches a filesystem from a jail. .El . .Ss Waiting .Bl -tag -width "" .It Xr zfs-wait 8 Wait for background activity in a filesystem to complete. .El . .Sh EXIT STATUS The .Nm utility exits .Sy 0 on success, .Sy 1 if an error occurs, and .Sy 2 if invalid command line options were specified. . .Sh EXAMPLES .\" Examples 1, 4, 6, 7, 11, 14, 16 are shared with zfs-set.8. .\" Examples 1, 10 are shared with zfs-create.8. .\" Examples 2, 3, 10, 15 are also shared with zfs-snapshot.8. .\" Examples 3, 10, 15 are shared with zfs-destroy.8. .\" Examples 5 are shared with zfs-list.8. .\" Examples 8 are shared with zfs-rollback.8. .\" Examples 9, 10 are shared with zfs-clone.8. .\" Examples 10 are also shared with zfs-promote.8. .\" Examples 10, 15 also are shared with zfs-rename.8. .\" Examples 12, 13 are shared with zfs-send.8. .\" Examples 12, 13 are also shared with zfs-receive.8. 
.\" Examples 17, 18, 19, 20, 21 are shared with zfs-allow.8. .\" Examples 22 are shared with zfs-diff.8. .\" Examples 23 are shared with zfs-bookmark.8. .\" Make sure to update them omnidirectionally .Ss Example 1 : No Creating a ZFS File System Hierarchy The following commands create a file system named .Ar pool/home and a file system named .Ar pool/home/bob . The mount point .Pa /export/home is set for the parent file system, and is automatically inherited by the child file system. .Dl # Nm zfs Cm create Ar pool/home .Dl # Nm zfs Cm set Sy mountpoint Ns = Ns Ar /export/home pool/home .Dl # Nm zfs Cm create Ar pool/home/bob . .Ss Example 2 : No Creating a ZFS Snapshot The following command creates a snapshot named .Ar yesterday . This snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of the .Ar pool/home/bob file system. .Dl # Nm zfs Cm snapshot Ar pool/home/bob Ns @ Ns Ar yesterday . .Ss Example 3 : No Creating and Destroying Multiple Snapshots The following command creates snapshots named .Ar yesterday No of Ar pool/home and all of its descendent file systems. Each snapshot is mounted on demand in the .Pa .zfs/snapshot directory at the root of its file system. The second command destroys the newly created snapshots. .Dl # Nm zfs Cm snapshot Fl r Ar pool/home Ns @ Ns Ar yesterday .Dl # Nm zfs Cm destroy Fl r Ar pool/home Ns @ Ns Ar yesterday . .Ss Example 4 : No Disabling and Enabling File System Compression The following command disables the .Sy compression property for all file systems under .Ar pool/home . The next command explicitly enables .Sy compression for .Ar pool/home/anne . .Dl # Nm zfs Cm set Sy compression Ns = Ns Sy off Ar pool/home .Dl # Nm zfs Cm set Sy compression Ns = Ns Sy on Ar pool/home/anne . .Ss Example 5 : No Listing ZFS Datasets The following command lists all active file systems and volumes in the system. Snapshots are displayed if .Sy listsnaps Ns = Ns Sy on . The default is .Sy off . See .Xr zpoolprops 7 for more information on pool properties. .Bd -literal -compact -offset Ds .No # Nm zfs Cm list NAME USED AVAIL REFER MOUNTPOINT pool 450K 457G 18K /pool pool/home 315K 457G 21K /export/home pool/home/anne 18K 457G 18K /export/home/anne pool/home/bob 276K 457G 276K /export/home/bob .Ed . .Ss Example 6 : No Setting a Quota on a ZFS File System The following command sets a quota of 50 Gbytes for .Ar pool/home/bob : .Dl # Nm zfs Cm set Sy quota Ns = Ns Ar 50G pool/home/bob . 
.Ss Example 7 : No Listing ZFS Properties The following command lists all properties for .Ar pool/home/bob : .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Sy all Ar pool/home/bob NAME PROPERTY VALUE SOURCE pool/home/bob type filesystem - pool/home/bob creation Tue Jul 21 15:53 2009 - pool/home/bob used 21K - pool/home/bob available 20.0G - pool/home/bob referenced 21K - pool/home/bob compressratio 1.00x - pool/home/bob mounted yes - pool/home/bob quota 20G local pool/home/bob reservation none default pool/home/bob recordsize 128K default pool/home/bob mountpoint /pool/home/bob default pool/home/bob sharenfs off default pool/home/bob checksum on default pool/home/bob compression on local pool/home/bob atime on default pool/home/bob devices on default pool/home/bob exec on default pool/home/bob setuid on default pool/home/bob readonly off default pool/home/bob zoned off default pool/home/bob snapdir hidden default pool/home/bob acltype off default pool/home/bob aclmode discard default pool/home/bob aclinherit restricted default pool/home/bob canmount on default pool/home/bob xattr on default pool/home/bob copies 1 default pool/home/bob version 4 - pool/home/bob utf8only off - pool/home/bob normalization none - pool/home/bob casesensitivity sensitive - pool/home/bob vscan off default pool/home/bob nbmand off default pool/home/bob sharesmb off default pool/home/bob refquota none default pool/home/bob refreservation none default pool/home/bob primarycache all default pool/home/bob secondarycache all default pool/home/bob usedbysnapshots 0 - pool/home/bob usedbydataset 21K - pool/home/bob usedbychildren 0 - pool/home/bob usedbyrefreservation 0 - .Ed .Pp The following command gets a single property value: .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Fl H o Sy value compression Ar pool/home/bob on .Ed .Pp The following command lists all properties with local settings for .Ar pool/home/bob : .Bd -literal -compact -offset Ds .No # Nm zfs Cm get Fl r s Sy local Fl o Sy name , Ns Sy property , Ns Sy value all Ar pool/home/bob NAME PROPERTY VALUE pool/home/bob quota 20G pool/home/bob compression on .Ed . .Ss Example 8 : No Rolling Back a ZFS File System The following command reverts the contents of .Ar pool/home/anne to the snapshot named .Ar yesterday , deleting all intermediate snapshots: .Dl # Nm zfs Cm rollback Fl r Ar pool/home/anne Ns @ Ns Ar yesterday . .Ss Example 9 : No Creating a ZFS Clone The following command creates a writable file system whose initial contents are the same as .Ar pool/home/bob@yesterday . .Dl # Nm zfs Cm clone Ar pool/home/bob@yesterday pool/clone . .Ss Example 10 : No Promoting a ZFS Clone The following commands illustrate how to test out changes to a file system, and then replace the original file system with the changed one, using clones, clone promotion, and renaming: .Bd -literal -compact -offset Ds .No # Nm zfs Cm create Ar pool/project/production populate /pool/project/production with data .No # Nm zfs Cm snapshot Ar pool/project/production Ns @ Ns Ar today .No # Nm zfs Cm clone Ar pool/project/production@today pool/project/beta make changes to /pool/project/beta and test them .No # Nm zfs Cm promote Ar pool/project/beta .No # Nm zfs Cm rename Ar pool/project/production pool/project/legacy .No # Nm zfs Cm rename Ar pool/project/beta pool/project/production once the legacy version is no longer needed, it can be destroyed .No # Nm zfs Cm destroy Ar pool/project/legacy .Ed . 
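.Pp For instance, the reversed clone relationship left behind by the promotion could afterwards be inspected through the .Sy origin property (an illustrative follow-up, not part of the original example): .Dl # Nm zfs Cm get Sy origin Ar pool/project/legacy .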
.Ss Example 11 : No Inheriting ZFS Properties The following command causes .Ar pool/home/bob No and Ar pool/home/anne to inherit the .Sy checksum property from their parent. .Dl # Nm zfs Cm inherit Sy checksum Ar pool/home/bob pool/home/anne . .Ss Example 12 : No Remotely Replicating ZFS Data The following commands send a full stream and then an incremental stream to a remote machine, restoring them into .Em poolB/received/fs@a and .Em poolB/received/fs@b , respectively. .Em poolB must contain the file system .Em poolB/received , and must not initially contain .Em poolB/received/fs . .Bd -literal -compact -offset Ds .No # Nm zfs Cm send Ar pool/fs@a | .No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs Ns @ Ns Ar a .No # Nm zfs Cm send Fl i Ar a pool/fs@b | .No " " Nm ssh Ar host Nm zfs Cm receive Ar poolB/received/fs .Ed . .Ss Example 13 : No Using the Nm zfs Cm receive Fl d No Option The following command sends a full stream of .Ar poolA/fsA/fsB@snap to a remote machine, receiving it into .Ar poolB/received/fsA/fsB@snap . The .Ar fsA/fsB@snap portion of the received snapshot's name is determined from the name of the sent snapshot. .Ar poolB must contain the file system .Ar poolB/received . If .Ar poolB/received/fsA does not exist, it is created as an empty file system. .Bd -literal -compact -offset Ds .No # Nm zfs Cm send Ar poolA/fsA/fsB@snap | .No " " Nm ssh Ar host Nm zfs Cm receive Fl d Ar poolB/received .Ed . .Ss Example 14 : No Setting User Properties The following example sets the user-defined .Ar com.example : Ns Ar department property for a dataset: .Dl # Nm zfs Cm set Ar com.example : Ns Ar department Ns = Ns Ar 12345 tank/accounting . .Ss Example 15 : No Performing a Rolling Snapshot The following example shows how to maintain a history of snapshots with a consistent naming scheme. To keep a week's worth of snapshots, the user destroys the oldest snapshot, renames the remaining snapshots, and then creates a new snapshot, as follows: .Bd -literal -compact -offset Ds .No # Nm zfs Cm destroy Fl r Ar pool/users@7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@6daysago No @ Ns Ar 7daysago .No # Nm zfs Cm rename Fl r Ar pool/users@5daysago No @ Ns Ar 6daysago .No # Nm zfs Cm rename Fl r Ar pool/users@4daysago No @ Ns Ar 5daysago .No # Nm zfs Cm rename Fl r Ar pool/users@3daysago No @ Ns Ar 4daysago .No # Nm zfs Cm rename Fl r Ar pool/users@2daysago No @ Ns Ar 3daysago .No # Nm zfs Cm rename Fl r Ar pool/users@yesterday No @ Ns Ar 2daysago .No # Nm zfs Cm rename Fl r Ar pool/users@today No @ Ns Ar yesterday .No # Nm zfs Cm snapshot Fl r Ar pool/users Ns @ Ns Ar today .Ed . .Ss Example 16 : No Setting sharenfs Property Options on a ZFS File System The following commands show how to set .Sy sharenfs property options to enable read-write access for a set of IP addresses and to enable root access for system .Qq neo on the .Ar tank/home file system: .Dl # Nm zfs Cm set Sy sharenfs Ns = Ns ' Ns Ar rw Ns =@123.123.0.0/16:[::1],root= Ns Ar neo Ns ' tank/home .Pp If you are using DNS for host name resolution, specify the fully-qualified hostname. . .Ss Example 17 : No Delegating ZFS Administration Permissions on a ZFS Dataset The following example shows how to set permissions so that user .Ar cindys can create, destroy, mount, and take snapshots on .Ar tank/cindys . The permissions on .Ar tank/cindys are also displayed. 
.Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Sy cindys create , Ns Sy destroy , Ns Sy mount , Ns Sy snapshot Ar tank/cindys .No # Nm zfs Cm allow Ar tank/cindys ---- Permissions on tank/cindys -------------------------------------- Local+Descendent permissions: user cindys create,destroy,mount,snapshot .Ed .Pp Because the .Ar tank/cindys mount point permission is set to 755 by default, user .Ar cindys will be unable to mount file systems under .Ar tank/cindys . Add an ACE similar to the following syntax to provide mount point access: .Dl # Cm chmod No A+user : Ns Ar cindys Ns :add_subdirectory:allow Ar /tank/cindys . .Ss Example 18 : No Delegating Create Time Permissions on a ZFS Dataset The following example shows how to grant anyone in the group .Ar staff to create file systems in .Ar tank/users . This syntax also allows staff members to destroy their own file systems, but not destroy anyone else's file system. The permissions on .Ar tank/users are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Ar staff Sy create , Ns Sy mount Ar tank/users .No # Nm zfs Cm allow Fl c Sy destroy Ar tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: destroy Local+Descendent permissions: group staff create,mount .Ed . .Ss Example 19 : No Defining and Granting a Permission Set on a ZFS Dataset The following example shows how to define and grant a permission set on the .Ar tank/users file system. The permissions on .Ar tank/users are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Fl s No @ Ns Ar pset Sy create , Ns Sy destroy , Ns Sy snapshot , Ns Sy mount Ar tank/users .No # Nm zfs Cm allow staff No @ Ns Ar pset tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed . .Ss Example 20 : No Delegating Property Permissions on a ZFS Dataset The following example shows to grant the ability to set quotas and reservations on the .Ar users/home file system. The permissions on .Ar users/home are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm allow Ar cindys Sy quota , Ns Sy reservation Ar users/home .No # Nm zfs Cm allow Ar users/home ---- Permissions on users/home --------------------------------------- Local+Descendent permissions: user cindys quota,reservation cindys% zfs set quota=10G users/home/marks cindys% zfs get quota users/home/marks NAME PROPERTY VALUE SOURCE users/home/marks quota 10G local .Ed . .Ss Example 21 : No Removing ZFS Delegated Permissions on a ZFS Dataset The following example shows how to remove the snapshot permission from the .Ar staff group on the .Sy tank/users file system. The permissions on .Sy tank/users are also displayed. .Bd -literal -compact -offset Ds .No # Nm zfs Cm unallow Ar staff Sy snapshot Ar tank/users .No # Nm zfs Cm allow Ar tank/users ---- Permissions on tank/users --------------------------------------- Permission sets: @pset create,destroy,mount,snapshot Local+Descendent permissions: group staff @pset .Ed . .Ss Example 22 : No Showing the differences between a snapshot and a ZFS Dataset The following example shows how to see what has changed between a prior snapshot of a ZFS dataset and its current state. The .Fl F option is used to indicate type information for the files affected. 
.Bd -literal -compact -offset Ds .No # Nm zfs Cm diff Fl F Ar tank/test@before tank/test M / /tank/test/ M F /tank/test/linked (+1) R F /tank/test/oldname -> /tank/test/newname - F /tank/test/deleted + F /tank/test/created M F /tank/test/modified .Ed . .Ss Example 23 : No Creating a bookmark The following example creates a bookmark to a snapshot. This bookmark can then be used instead of a snapshot in send streams. .Dl # Nm zfs Cm bookmark Ar rpool Ns @ Ns Ar snapshot rpool Ns # Ns Ar bookmark . .Ss Example 24 : No Setting Sy sharesmb No Property Options on a ZFS File System The following example shows how to share an SMB filesystem through ZFS. Note that a user and their password must be given. .Dl # Nm smbmount Ar //127.0.0.1/share_tmp /mnt/tmp Fl o No user=workgroup/turbo,password=obrut,uid=1000 .Pp Minimal .Pa /etc/samba/smb.conf configuration is required, as follows. .Pp Samba will need to bind to the loopback interface for the ZFS utilities to communicate with Samba. This is the default behavior for most Linux distributions. .Pp Samba must be able to authenticate a user. This can be done in a number of ways .Pq Xr passwd 5 , LDAP , Xr smbpasswd 5 , &c.\& . How to do this is outside the scope of this document – refer to .Xr smb.conf 5 for more information. .Pp See the .Sx USERSHARES section for all configuration options, in case you need to modify any options of the share afterwards. Do note that any changes done with the .Xr net 8 command will be undone if the share is ever unshared (like via a reboot). . .Sh ENVIRONMENT VARIABLES .Bl -tag -width "ZFS_MODULE_TIMEOUT" .It Sy ZFS_COLOR Use ANSI color in .Nm zfs Cm diff and .Nm zfs Cm list output. .It Sy ZFS_MOUNT_HELPER Cause .Nm zfs Cm mount to use .Xr mount 8 to mount ZFS datasets. This option is provided for backwards compatibility with older ZFS versions. . .It Sy ZFS_SET_PIPE_MAX Tells .Nm zfs to set the maximum pipe size for sends/receives. Disabled by default on Linux due to an unfixed deadlock in Linux's pipe size handling code. . .\" Shared with zpool.8 .It Sy ZFS_MODULE_TIMEOUT Time, in seconds, to wait for .Pa /dev/zfs to appear. Defaults to .Sy 10 , max .Sy 600 Pq 10 minutes . If .Pf < Sy 0 , wait forever; if .Sy 0 , don't wait. .El . .Sh INTERFACE STABILITY .Sy Committed . .
.Sh SEE ALSO .Xr attr 1 , .Xr gzip 1 , .Xr ssh 1 , .Xr chmod 2 , .Xr fsync 2 , .Xr stat 2 , .Xr write 2 , .Xr acl 5 , .Xr attributes 5 , .Xr exports 5 , .Xr zfsconcepts 7 , .Xr zfsprops 7 , .Xr exportfs 8 , .Xr mount 8 , .Xr net 8 , .Xr selinux 8 , .Xr zfs-allow 8 , .Xr zfs-bookmark 8 , .Xr zfs-change-key 8 , .Xr zfs-clone 8 , .Xr zfs-create 8 , .Xr zfs-destroy 8 , .Xr zfs-diff 8 , .Xr zfs-get 8 , .Xr zfs-groupspace 8 , .Xr zfs-hold 8 , .Xr zfs-inherit 8 , .Xr zfs-jail 8 , .Xr zfs-list 8 , .Xr zfs-load-key 8 , .Xr zfs-mount 8 , .Xr zfs-program 8 , .Xr zfs-project 8 , .Xr zfs-projectspace 8 , .Xr zfs-promote 8 , .Xr zfs-receive 8 , .Xr zfs-redact 8 , .Xr zfs-release 8 , .Xr zfs-rename 8 , .Xr zfs-rollback 8 , .Xr zfs-send 8 , .Xr zfs-set 8 , .Xr zfs-share 8 , .Xr zfs-snapshot 8 , .Xr zfs-unallow 8 , .Xr zfs-unjail 8 , .Xr zfs-unload-key 8 , .Xr zfs-unmount 8 , .Xr zfs-unshare 8 , .Xr zfs-upgrade 8 , .Xr zfs-userspace 8 , .Xr zfs-wait 8 , .Xr zpool 8 diff --git a/sys/contrib/openzfs/man/man8/zfs_ids_to_path.8 b/sys/contrib/openzfs/man/man8/zfs_ids_to_path.8 index eef0ce68f17b..465e336d170c 100644 --- a/sys/contrib/openzfs/man/man8/zfs_ids_to_path.8 +++ b/sys/contrib/openzfs/man/man8/zfs_ids_to_path.8 @@ -1,52 +1,52 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2020 by Delphix. All rights reserved. .\" -.Dd April 17, 2020 +.Dd July 11, 2022 .Dt ZFS_IDS_TO_PATH 8 .Os . .Sh NAME .Nm zfs_ids_to_path .Nd convert objset and object ids to names and paths .Sh SYNOPSIS .Nm .Op Fl v .Ar pool .Ar objset-id .Ar object-id . .Sh DESCRIPTION The .Sy zfs_ids_to_path utility converts a provided objset and object ids into a path to the file they refer to. .Bl -tag -width "-D" .It Fl v Verbose. Print the dataset name and the file path within the dataset separately. This will work correctly even if the dataset is not mounted. .El . .Sh SEE ALSO .Xr zdb 8 , .Xr zfs 8 diff --git a/sys/contrib/openzfs/man/man8/zgenhostid.8 b/sys/contrib/openzfs/man/man8/zgenhostid.8 index 2b5b4fc18216..ff564880f97d 100644 --- a/sys/contrib/openzfs/man/man8/zgenhostid.8 +++ b/sys/contrib/openzfs/man/man8/zgenhostid.8 @@ -1,101 +1,101 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. 
.\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2017 by Lawrence Livermore National Security, LLC. .\" -.Dd May 26, 2021 +.Dd July 11, 2022 .Dt ZGENHOSTID 8 .Os . .Sh NAME .Nm zgenhostid .Nd generate host ID into /etc/hostid .Sh SYNOPSIS .Nm .Op Fl f .Op Fl o Ar filename .Op Ar hostid . .Sh DESCRIPTION Creates .Pa /etc/hostid file and stores the host ID in it. If .Ar hostid was provided, validate and store that value. Otherwise, randomly generate an ID. . .Sh OPTIONS .Bl -tag -width "-o filename" .It Fl h Display a summary of the command-line options. .It Fl f Allow output overwrite. .It Fl o Ar filename Write to .Pa filename instead of the default .Pa /etc/hostid . .It Ar hostid Specifies the value to be placed in .Pa /etc/hostid . It should be a number with a value between 1 and 2^32-1. If .Sy 0 , generate a random ID. This value .Em must be unique among your systems. It .Em must be an 8-digit-long hexadecimal number, optionally prefixed by .Qq 0x . .El . .Sh FILES .Pa /etc/hostid . .Sh EXAMPLES .Bl -tag -width Bd .It Generate a random hostid and store it .Dl # Nm .It Record the libc-generated hostid in Pa /etc/hostid .Dl # Nm Qq $ Ns Pq Nm hostid .It Record a custom hostid Po Ar 0xdeadbeef Pc in Pa /etc/hostid .Dl # Nm Ar deadbeef .It Record a custom hostid Po Ar 0x01234567 Pc in Pa /tmp/hostid No and overwrite the file if it exists .Dl # Nm Fl f o Ar /tmp/hostid 0x01234567 .El . .Sh SEE ALSO .Xr genhostid 1 , .Xr hostid 1 , .Xr sethostid 3 , .Xr spl 4 . .Sh HISTORY .Nm emulates the .Xr genhostid 1 utility and is provided for use on systems which do not include the utility or do not provide the .Xr sethostid 3 function. diff --git a/sys/contrib/openzfs/man/man8/zpool-attach.8 b/sys/contrib/openzfs/man/man8/zpool-attach.8 index 51d876767666..f120350a5190 100644 --- a/sys/contrib/openzfs/man/man8/zpool-attach.8 +++ b/sys/contrib/openzfs/man/man8/zpool-attach.8 @@ -1,142 +1,142 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
.\" -.Dd June 28, 2023 +.Dd November 8, 2023 .Dt ZPOOL-ATTACH 8 .Os . .Sh NAME .Nm zpool-attach .Nd attach new device to existing ZFS vdev .Sh SYNOPSIS .Nm zpool .Cm attach .Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool device new_device . .Sh DESCRIPTION Attaches .Ar new_device to the existing .Ar device . The behavior differs depending on if the existing .Ar device is a RAID-Z device, or a mirror/plain device. .Pp If the existing device is a mirror or plain device .Pq e.g. specified as Qo Li sda Qc or Qq Li mirror-7 , the new device will be mirrored with the existing device, a resilver will be initiated, and the new device will contribute to additional redundancy once the resilver completes. If .Ar device is not currently part of a mirrored configuration, .Ar device automatically transforms into a two-way mirror of .Ar device and .Ar new_device . If .Ar device is part of a two-way mirror, attaching .Ar new_device creates a three-way mirror, and so on. In either case, .Ar new_device begins to resilver immediately and any running scrub is canceled. .Pp If the existing device is a RAID-Z device .Pq e.g. specified as Qq Ar raidz2-0 , the new device will become part of that RAID-Z group. A "raidz expansion" will be initiated, and once the expansion completes, the new device will contribute additional space to the RAID-Z group. The expansion entails reading all allocated space from existing disks in the RAID-Z group, and rewriting it to the new disks in the RAID-Z group (including the newly added .Ar device ) . Its progress can be monitored with .Nm zpool Cm status . .Pp Data redundancy is maintained during and after the expansion. If a disk fails while the expansion is in progress, the expansion pauses until the health of the RAID-Z vdev is restored (e.g. by replacing the failed disk and waiting for reconstruction to complete). Expansion does not change the number of failures that can be tolerated without data loss (e.g. a RAID-Z2 is still a RAID-Z2 even after expansion). A RAID-Z vdev can be expanded multiple times. .Pp After the expansion completes, old blocks retain their old data-to-parity ratio .Pq e.g. 5-wide RAID-Z2 has 3 data and 2 parity but distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide RAID-Z2 which has been expanded once to 6-wide, has 4 data and 2 parity). However, the vdev's assumed parity ratio does not change, so slightly less space than is expected may be reported for newly-written blocks, according to .Nm zfs Cm list , .Nm df , .Nm ls Fl s , and similar tools. .Pp A pool-wide scrub is initiated at the end of the expansion in order to verify the checksums of all blocks which have been copied during the expansion. .Bl -tag -width Ds .It Fl f Forces use of .Ar new_device , even if it appears to be in use. Not all devices can be overridden in this manner. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Xr zpoolprops 7 manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . .It Fl s When attaching to a mirror or plain device, the .Ar new_device is reconstructed sequentially to restore redundancy as quickly as possible. Checksums are not verified during sequential reconstruction so a scrub is started when the resilver completes. .It Fl w Waits until .Ar new_device has finished resilvering or expanding before returning. .El . 
.Sh SEE ALSO .Xr zpool-add 8 , .Xr zpool-detach 8 , .Xr zpool-import 8 , .Xr zpool-initialize 8 , .Xr zpool-online 8 , .Xr zpool-replace 8 , .Xr zpool-resilver 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 b/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 index d97d10d5df6e..b654f669cfa2 100644 --- a/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 +++ b/sys/contrib/openzfs/man/man8/zpool-checkpoint.8 @@ -1,73 +1,73 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd July 11, 2022 .Dt ZPOOL-CHECKPOINT 8 .Os . .Sh NAME .Nm zpool-checkpoint .Nd check-point current ZFS storage pool state .Sh SYNOPSIS .Nm zpool .Cm checkpoint .Op Fl d Op Fl w .Ar pool . .Sh DESCRIPTION Checkpoints the current state of .Ar pool , which can be later restored by .Nm zpool Cm import --rewind-to-checkpoint . The existence of a checkpoint in a pool prohibits the following .Nm zpool subcommands: .Cm remove , attach , detach , split , No and Cm reguid . In addition, it may break reservation boundaries if the pool lacks free space. The .Nm zpool Cm status command indicates the existence of a checkpoint or the progress of discarding a checkpoint from a pool. .Nm zpool Cm list can be used to check how much space the checkpoint takes from the pool. . .Sh OPTIONS .Bl -tag -width Ds .It Fl d , -discard Discards an existing checkpoint from .Ar pool . .It Fl w , -wait Waits until the checkpoint has finished being discarded before returning. .El . .Sh SEE ALSO .Xr zfs-snapshot 8 , .Xr zpool-import 8 , .Xr zpool-status 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-clear.8 b/sys/contrib/openzfs/man/man8/zpool-clear.8 index 19cd4be36408..70cd8325bd0e 100644 --- a/sys/contrib/openzfs/man/man8/zpool-clear.8 +++ b/sys/contrib/openzfs/man/man8/zpool-clear.8 @@ -1,72 +1,72 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. 
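A sketch of the typical checkpoint/discard/rewind cycle for zpool-checkpoint as described above; the pool name is a placeholder.

# Take a checkpoint of the current pool state:
zpool checkpoint tank

# Later, discard it and wait until the discard completes:
zpool checkpoint -d -w tank

# Or roll the pool back to the checkpointed state instead
# (requires an export/import cycle; the rewind cannot be undone):
zpool export tank
zpool import --rewind-to-checkpoint tank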
.\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd April 29, 2024 .Dt ZPOOL-CLEAR 8 .Os . .Sh NAME .Nm zpool-clear .Nd clear device errors in ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm clear .Op Fl -power .Ar pool .Oo Ar device Oc Ns … . .Sh DESCRIPTION Clears device errors in a pool. If no arguments are specified, all device errors within the pool are cleared. If one or more devices is specified, only those errors associated with the specified device or devices are cleared. .Pp If the pool was suspended it will be brought back online provided the devices can be accessed. Pools with .Sy multihost enabled which have been suspended cannot be resumed when there is evidence that the pool was imported by another host. The same checks performed during an import will be applied before the clear proceeds. .Bl -tag -width Ds .It Fl -power Power on the devices's slot in the storage enclosure and wait for the device to show up before attempting to clear errors. This is done on all the devices specified. Alternatively, you can set the .Sy ZPOOL_AUTO_POWER_ON_SLOT environment variable to always enable this behavior. Note: This flag currently works on Linux only. .El . .Sh SEE ALSO .Xr zdb 8 , .Xr zpool-reopen 8 , .Xr zpool-status 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-create.8 b/sys/contrib/openzfs/man/man8/zpool-create.8 index 490c67629a20..a36ae260a158 100644 --- a/sys/contrib/openzfs/man/man8/zpool-create.8 +++ b/sys/contrib/openzfs/man/man8/zpool-create.8 @@ -1,245 +1,245 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. 
.\" Copyright (c) 2021, Colm Buckley .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-CREATE 8 .Os . .Sh NAME .Nm zpool-create .Nd create ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm create .Op Fl dfn .Op Fl m Ar mountpoint .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Oo Fl o Sy feature@ Ns Ar feature Ns = Ns Ar value Oc .Op Fl o Ar compatibility Ns = Ns Sy off Ns | Ns Sy legacy Ns | Ns Ar file Ns Oo , Ns Ar file Oc Ns … .Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns … .Op Fl R Ar root .Op Fl t Ar tname .Ar pool .Ar vdev Ns … . .Sh DESCRIPTION Creates a new storage pool containing the virtual devices specified on the command line. The pool name must begin with a letter, and can only contain alphanumeric characters as well as the underscore .Pq Qq Sy _ , dash .Pq Qq Sy \&- , colon .Pq Qq Sy \&: , space .Pq Qq Sy \&\ , and period .Pq Qq Sy \&. . The pool names .Sy mirror , .Sy raidz , .Sy draid , .Sy spare and .Sy log are reserved, as are names beginning with .Sy mirror , .Sy raidz , .Sy draid , and .Sy spare . The .Ar vdev specification is described in the .Sx Virtual Devices section of .Xr zpoolconcepts 7 . .Pp The command attempts to verify that each device specified is accessible and not currently in use by another subsystem. However this check is not robust enough to detect simultaneous attempts to use a new device in different pools, even if .Sy multihost Ns = Sy enabled . The administrator must ensure that simultaneous invocations of any combination of .Nm zpool Cm replace , .Nm zpool Cm create , .Nm zpool Cm add , or .Nm zpool Cm labelclear do not refer to the same device. Using the same device in two pools will result in pool corruption. .Pp There are some uses, such as being currently mounted, or specified as the dedicated dump device, that prevents a device from ever being used by ZFS. Other uses, such as having a preexisting UFS file system, can be overridden with .Fl f . .Pp The command also checks that the replication strategy for the pool is consistent. An attempt to combine redundant and non-redundant storage in a single pool, or to mix disks and files, results in an error unless .Fl f is specified. The use of differently-sized devices within a single raidz or mirror group is also flagged as an error unless .Fl f is specified. .Pp Unless the .Fl R option is specified, the default mount point is .Pa / Ns Ar pool . The mount point must not exist or must be empty, or else the root dataset will not be able to be be mounted. This can be overridden with the .Fl m option. .Pp By default all supported features are enabled on the new pool. The .Fl d option and the .Fl o Ar compatibility property .Pq e.g Fl o Sy compatibility Ns = Ns Ar 2020 can be used to restrict the features that are enabled, so that the pool can be imported on other releases of ZFS. .Bl -tag -width "-t tname" .It Fl d Do not enable any features on the new pool. Individual features can be enabled by setting their corresponding properties to .Sy enabled with .Fl o . See .Xr zpool-features 7 for details about feature properties. .It Fl f Forces use of .Ar vdev Ns s , even if they appear in use or specify a conflicting replication level. Not all devices can be overridden in this manner. .It Fl m Ar mountpoint Sets the mount point for the root dataset. The default mount point is .Pa /pool or .Pa altroot/pool if .Sy altroot is specified. The mount point must be an absolute path, .Sy legacy , or .Sy none . For more information on dataset mount points, see .Xr zfsprops 7 . 
.It Fl n Displays the configuration that would be used without actually creating the pool. The actual pool creation can still fail due to insufficient privileges or device sharing. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See .Xr zpoolprops 7 for a list of valid properties that can be set. .It Fl o Ar compatibility Ns = Ns Sy off Ns | Ns Sy legacy Ns | Ns Ar file Ns Oo , Ns Ar file Oc Ns … Specifies compatibility feature sets. See .Xr zpool-features 7 for more information about compatibility feature sets. .It Fl o Sy feature@ Ns Ar feature Ns = Ns Ar value Sets the given pool feature. See the .Xr zpool-features 7 section for a list of valid features that can be set. Value can be either disabled or enabled. .It Fl O Ar file-system-property Ns = Ns Ar value Sets the given file system properties in the root file system of the pool. See .Xr zfsprops 7 for a list of valid properties that can be set. .It Fl R Ar root Equivalent to .Fl o Sy cachefile Ns = Ns Sy none Fl o Sy altroot Ns = Ns Ar root .It Fl t Ar tname Sets the in-core pool name to .Ar tname while the on-disk name will be the name specified as .Ar pool . This will set the default of the .Sy cachefile property to .Sy none . This is intended to handle name space collisions when creating pools for other systems, such as virtual machines or physical machines whose pools live on network block devices. .El . .Sh EXAMPLES .\" These are, respectively, examples 1, 2, 3, 4, 11, 12 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Creating a RAID-Z Storage Pool The following command creates a pool with a single raidz root vdev that consists of six disks: .Dl # Nm zpool Cm create Ar tank Sy raidz Pa sda sdb sdc sdd sde sdf . .Ss Example 2 : No Creating a Mirrored Storage Pool The following command creates a pool with two mirrors, where each mirror contains two disks: .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy mirror Pa sdc sdd . .Ss Example 3 : No Creating a ZFS Storage Pool by Using Partitions The following command creates a non-redundant pool using two disk partitions: .Dl # Nm zpool Cm create Ar tank Pa sda1 sdb2 . .Ss Example 4 : No Creating a ZFS Storage Pool by Using Files The following command creates a non-redundant pool using files. While not recommended, a pool based on files can be useful for experimental purposes. .Dl # Nm zpool Cm create Ar tank Pa /path/to/file/a /path/to/file/b . .Ss Example 5 : No Managing Hot Spares The following command creates a new pool with an available hot spare: .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy spare Pa sdc . .Ss Example 6 : No Creating a ZFS Pool with Mirrored Separate Intent Logs The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Dl # Nm zpool Cm create Ar pool Sy mirror Pa sda sdb Sy mirror Pa sdc sdd Sy log mirror Pa sde sdf . .Sh SEE ALSO .Xr zpool-destroy 8 , .Xr zpool-export 8 , .Xr zpool-import 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-destroy.8 b/sys/contrib/openzfs/man/man8/zpool-destroy.8 index f49f29804ad7..82f3f3e203d6 100644 --- a/sys/contrib/openzfs/man/man8/zpool-destroy.8 +++ b/sys/contrib/openzfs/man/man8/zpool-destroy.8 @@ -1,58 +1,58 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. 
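A few additional zpool create invocations, exercising options discussed above that the numbered examples do not cover; pool names, devices, and property choices are placeholders.

# Show what would be created without actually creating the pool:
zpool create -n tank mirror sda sdb

# Create a pool with a custom mount point and a root-dataset property:
zpool create -m /export/tank -O compression=lz4 tank mirror sda sdb

# Restrict enabled features so the pool remains importable by older releases:
zpool create -o compatibility=legacy tank mirror sda sdb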
.\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-DESTROY 8 .Os . .Sh NAME .Nm zpool-destroy .Nd destroy ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm destroy .Op Fl f .Ar pool . .Sh DESCRIPTION Destroys the given pool, freeing up any devices for other use. This command tries to unmount any active datasets before destroying the pool. .Bl -tag -width Ds .It Fl f Forcefully unmount all active datasets. .El . .Sh EXAMPLES .\" These are, respectively, examples 7 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Destroying a ZFS Storage Pool The following command destroys the pool .Ar tank and any datasets contained within: .Dl # Nm zpool Cm destroy Fl f Ar tank diff --git a/sys/contrib/openzfs/man/man8/zpool-detach.8 b/sys/contrib/openzfs/man/man8/zpool-detach.8 index ae02dbc2d5b8..79a44310110d 100644 --- a/sys/contrib/openzfs/man/man8/zpool-detach.8 +++ b/sys/contrib/openzfs/man/man8/zpool-detach.8 @@ -1,59 +1,59 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd July 11, 2022 .Dt ZPOOL-DETACH 8 .Os . .Sh NAME .Nm zpool-detach .Nd detach device from ZFS mirror .Sh SYNOPSIS .Nm zpool .Cm detach .Ar pool device . .Sh DESCRIPTION Detaches .Ar device from a mirror. The operation is refused if there are no other valid replicas of the data. 
If .Ar device may be re-added to the pool later on then consider the .Nm zpool Cm offline command instead. . .Sh SEE ALSO .Xr zpool-attach 8 , .Xr zpool-labelclear 8 , .Xr zpool-offline 8 , .Xr zpool-remove 8 , .Xr zpool-replace 8 , .Xr zpool-split 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-export.8 b/sys/contrib/openzfs/man/man8/zpool-export.8 index 171a7541c6d2..02495c088f94 100644 --- a/sys/contrib/openzfs/man/man8/zpool-export.8 +++ b/sys/contrib/openzfs/man/man8/zpool-export.8 @@ -1,83 +1,83 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-EXPORT 8 .Os . .Sh NAME .Nm zpool-export .Nd export ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm export .Op Fl f .Fl a Ns | Ns Ar pool Ns … . .Sh DESCRIPTION Exports the given pools from the system. All devices are marked as exported, but are still considered in use by other subsystems. The devices can be moved between systems .Pq even those of different endianness and imported as long as a sufficient number of devices are present. .Pp Before exporting the pool, all datasets within the pool are unmounted. A pool can not be exported if it has a shared spare that is currently being used. .Pp For pools to be portable, you must give the .Nm zpool command whole disks, not just partitions, so that ZFS can label the disks with portable EFI labels. Otherwise, disk drivers on platforms of different endianness will not recognize the disks. .Bl -tag -width Ds .It Fl a Exports all pools imported on the system. .It Fl f Forcefully unmount all datasets, and allow export of pools with active shared spares. .Pp This command will forcefully export the pool even if it has a shared spare that is currently being used. This may lead to potential data corruption. .El . .Sh EXAMPLES .\" These are, respectively, examples 8 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Exporting a ZFS Storage Pool The following command exports the devices in pool .Ar tank so that they can be relocated or later imported: .Dl # Nm zpool Cm export Ar tank . 
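The export example above is usually one half of a move between systems; a sketch of the full round trip, with the pool name and device directory as placeholders.

# On the old system, unmount and export the pool:
zpool export tank

# On the new system, search a specific device directory and import it:
zpool import -d /dev/disk/by-id tank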
.Sh SEE ALSO .Xr zpool-import 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-get.8 b/sys/contrib/openzfs/man/man8/zpool-get.8 index 1d6d1f08afa6..bfe1bae7619f 100644 --- a/sys/contrib/openzfs/man/man8/zpool-get.8 +++ b/sys/contrib/openzfs/man/man8/zpool-get.8 @@ -1,205 +1,205 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd October 12, 2024 .Dt ZPOOL-GET 8 .Os . .Sh NAME .Nm zpool-get .Nd retrieve properties of ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm get .Op Fl Hp .Op Fl j Op Ar --json-int, --json-pool-key-guid .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns … .Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … .Oo Ar pool Oc Ns … . .Nm zpool .Cm get .Op Fl Hp .Op Fl j Op Ar --json-int .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns … .Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … .Ar pool .Oo Sy all-vdevs Ns | Ns .Ar vdev Oc Ns … . .Nm zpool .Cm set .Ar property Ns = Ns Ar value .Ar pool . .Nm zpool .Cm set .Ar property Ns = Ns Ar value .Ar pool .Ar vdev . .Sh DESCRIPTION .Bl -tag -width Ds .It Xo .Nm zpool .Cm get .Op Fl Hp .Op Fl j Op Ar --json-int, --json-pool-key-guid .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns … .Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … .Oo Ar pool Oc Ns … .Xc Retrieves the given list of properties .Po or all properties if .Sy all is used .Pc for the specified storage pool(s). These properties are displayed with the following fields: .Bl -tag -compact -offset Ds -width "property" .It Sy name Name of storage pool. .It Sy property Property name. .It Sy value Property value. .It Sy source Property source, either .Sy default No or Sy local . .El .Pp See the .Xr zpoolprops 7 manual page for more information on the available pool properties. .Bl -tag -compact -offset Ds -width "-o field" .It Fl j , -json Op Ar --json-int, --json-pool-key-guid Display the list of properties in JSON format. Specify .Sy --json-int to display the numbers in integer format instead of strings in JSON output. Specify .Sy --json-pool-key-guid to set pool GUID as key for pool objects instead of pool name. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl o Ar field A comma-separated list of columns to display, defaults to .Sy name , Ns Sy property , Ns Sy value , Ns Sy source . 
.It Fl p Display numbers in parsable (exact) values. .El .It Xo .Nm zpool .Cm get .Op Fl j Op Ar --json-int .Op Fl Hp .Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns … .Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns … .Ar pool .Oo Sy all-vdevs Ns | Ns .Ar vdev Oc Ns … .Xc Retrieves the given list of properties .Po or all properties if .Sy all is used .Pc for the specified vdevs .Po or all vdevs if .Sy all-vdevs is used .Pc in the specified pool. These properties are displayed with the following fields: .Bl -tag -compact -offset Ds -width "property" .It Sy name Name of vdev. .It Sy property Property name. .It Sy value Property value. .It Sy source Property source, either .Sy default No or Sy local . .El .Pp See the .Xr vdevprops 7 manual page for more information on the available pool properties. .Bl -tag -compact -offset Ds -width "-o field" .It Fl j , -json Op Ar --json-int Display the list of properties in JSON format. Specify .Sy --json-int to display the numbers in integer format instead of strings in JSON output. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl o Ar field A comma-separated list of columns to display, defaults to .Sy name , Ns Sy property , Ns Sy value , Ns Sy source . .It Fl p Display numbers in parsable (exact) values. .El .It Xo .Nm zpool .Cm set .Ar property Ns = Ns Ar value .Ar pool .Xc Sets the given property on the specified pool. See the .Xr zpoolprops 7 manual page for more information on what properties can be set and acceptable values. .It Xo .Nm zpool .Cm set .Ar property Ns = Ns Ar value .Ar pool .Ar vdev .Xc Sets the given property on the specified vdev in the specified pool. See the .Xr vdevprops 7 manual page for more information on what properties can be set and acceptable values. .El . .Sh SEE ALSO .Xr vdevprops 7 , .Xr zpool-features 7 , .Xr zpoolprops 7 , .Xr zpool-list 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-history.8 b/sys/contrib/openzfs/man/man8/zpool-history.8 index f15086eabc47..f02168951ff2 100644 --- a/sys/contrib/openzfs/man/man8/zpool-history.8 +++ b/sys/contrib/openzfs/man/man8/zpool-history.8 @@ -1,59 +1,59 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd July 11, 2022 .Dt ZPOOL-HISTORY 8 .Os . 
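Hypothetical invocations of the zpool get and zpool set forms documented above; the pool name and the property choices are only examples.

# Retrieve a couple of pool properties in the default tabular layout:
zpool get health,capacity tank

# Scripted form: only the value, no header, tab-separated fields:
zpool get -H -o value health tank

# Set a pool property:
zpool set autoexpand=on tank

# Retrieve selected properties for every vdev in the pool:
zpool get size,state tank all-vdevs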
.Sh NAME .Nm zpool-history .Nd inspect command history of ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm history .Op Fl il .Oo Ar pool Oc Ns … . .Sh DESCRIPTION Displays the command history of the specified pool(s) or all pools if no pool is specified. .Bl -tag -width Ds .It Fl i Displays internally logged ZFS events in addition to user initiated events. .It Fl l Displays log records in long format, which in addition to standard format includes, the user name, the hostname, and the zone in which the operation was performed. .El . .Sh SEE ALSO .Xr zpool-checkpoint 8 , .Xr zpool-events 8 , .Xr zpool-status 8 , .Xr zpool-wait 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-import.8 b/sys/contrib/openzfs/man/man8/zpool-import.8 index 9076f5c34929..c6d5f222b6b2 100644 --- a/sys/contrib/openzfs/man/man8/zpool-import.8 +++ b/sys/contrib/openzfs/man/man8/zpool-import.8 @@ -1,436 +1,436 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-IMPORT 8 .Os . .Sh NAME .Nm zpool-import .Nd import ZFS storage pools or list available pools .Sh SYNOPSIS .Nm zpool .Cm import .Op Fl D .Oo Fl d Ar dir Ns | Ns Ar device Oc Ns … .Nm zpool .Cm import .Fl a .Op Fl DflmN .Op Fl F Op Fl nTX .Op Fl -rewind-to-checkpoint .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns Ar device .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Op Fl R Ar root .Nm zpool .Cm import .Op Fl Dflmt .Op Fl F Op Fl nTX .Op Fl -rewind-to-checkpoint .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns Ar device .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Op Fl R Ar root .Op Fl s .Ar pool Ns | Ns Ar id .Op Ar newpool . .Sh DESCRIPTION .Bl -tag -width Ds .It Xo .Nm zpool .Cm import .Op Fl D .Oo Fl d Ar dir Ns | Ns Ar device Oc Ns … .Xc Lists pools available to import. If the .Fl d or .Fl c options are not specified, this command searches for devices using libblkid on Linux and geom on .Fx . The .Fl d option can be specified multiple times, and all directories are searched. If the device appears to be part of an exported pool, this command displays a summary of the pool with the name of the pool, a numeric identifier, as well as the vdev layout and current health of the device for each device or file. 
Destroyed pools, pools that were previously destroyed with the .Nm zpool Cm destroy command, are not listed unless the .Fl D option is specified. .Pp The numeric identifier is unique, and can be used instead of the pool name when multiple exported pools of the same name are available. .Bl -tag -width Ds .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Ns | Ns Ar device Uses .Ar device or searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. .It Fl D Lists destroyed pools only. .El .It Xo .Nm zpool .Cm import .Fl a .Op Fl DflmN .Op Fl F Op Fl nTX .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns Ar device .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Op Fl R Ar root .Op Fl s .Xc Imports all pools found in the search directories. Identical to the previous command, except that all pools with a sufficient number of devices available are imported. Destroyed pools, pools that were previously destroyed with the .Nm zpool Cm destroy command, will not be imported unless the .Fl D option is specified. .Bl -tag -width Ds .It Fl a Searches for and imports all pools found. .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Ns | Ns Ar device Uses .Ar device or searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. This option is incompatible with the .Fl c option. .It Fl D Imports destroyed pools only. The .Fl f option is also required. .It Fl f Forces import, even if the pool appears to be potentially active. .It Fl F Recovery mode for a non-importable pool. Attempt to return the pool to an importable state by discarding the last few transactions. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. This option is ignored if the pool is importable or already imported. .It Fl l Indicates that this command will request encryption keys for all encrypted datasets it attempts to mount as it is bringing the pool online. Note that if any datasets have a .Sy keylocation of .Sy prompt this command will block waiting for the keys to be entered. Without this flag encrypted datasets will be left unavailable until the keys are loaded. .It Fl m Allows a pool to import when there is a missing log device. Recent transactions can be lost because the log device will be discarded. .It Fl n Used with the .Fl F recovery option. Determines whether a non-importable pool can be made importable again, but does not actually perform the pool recovery. For more details about pool recovery mode, see the .Fl F option, above. .It Fl N Import the pool without mounting any file systems. .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 8 for a description of dataset properties and mount options. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the imported pool. See the .Xr zpoolprops 7 manual page for more information on the available pool properties. .It Fl R Ar root Sets the .Sy cachefile property to .Sy none and the .Sy altroot property to .Ar root . .It Fl -rewind-to-checkpoint Rewinds pool to the checkpointed state. 
Once the pool is imported with this flag there is no way to undo the rewind. All changes and data that were written after the checkpoint are lost! The only exception is when the .Sy readonly mounting option is enabled. In this case, the checkpointed state of the pool is opened and an administrator can see how the pool would look like if they were to fully rewind. .It Fl s Scan using the default search path, the libblkid cache will not be consulted. A custom search path may be specified by setting the .Sy ZPOOL_IMPORT_PATH environment variable. .It Fl X Used with the .Fl F recovery option. Determines whether extreme measures to find a valid txg should take place. This allows the pool to be rolled back to a txg which is no longer guaranteed to be consistent. Pools imported at an inconsistent txg may contain uncorrectable checksum errors. For more details about pool recovery mode, see the .Fl F option, above. WARNING: This option can be extremely hazardous to the health of your pool and should only be used as a last resort. .It Fl T Specify the txg to use for rollback. Implies .Fl FX . For more details about pool recovery mode, see the .Fl X option, above. WARNING: This option can be extremely hazardous to the health of your pool and should only be used as a last resort. .El .It Xo .Nm zpool .Cm import .Op Fl Dflmt .Op Fl F Op Fl nTX .Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns Ar device .Op Fl o Ar mntopts .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Op Fl R Ar root .Op Fl s .Ar pool Ns | Ns Ar id .Op Ar newpool .Xc Imports a specific pool. A pool can be identified by its name or the numeric identifier. If .Ar newpool is specified, the pool is imported using the name .Ar newpool . Otherwise, it is imported with the same name as its exported name. .Pp If a device is removed from a system without running .Nm zpool Cm export first, the device appears as potentially active. It cannot be determined if this was a failed export, or whether the device is really in use from another host. To import a pool in this state, the .Fl f option is required. .Bl -tag -width Ds .It Fl c Ar cachefile Reads configuration from the given .Ar cachefile that was created with the .Sy cachefile pool property. This .Ar cachefile is used instead of searching for devices. .It Fl d Ar dir Ns | Ns Ar device Uses .Ar device or searches for devices or files in .Ar dir . The .Fl d option can be specified multiple times. This option is incompatible with the .Fl c option. .It Fl D Imports destroyed pool. The .Fl f option is also required. .It Fl f Forces import, even if the pool appears to be potentially active. .It Fl F Recovery mode for a non-importable pool. Attempt to return the pool to an importable state by discarding the last few transactions. Not all damaged pools can be recovered by using this option. If successful, the data from the discarded transactions is irretrievably lost. This option is ignored if the pool is importable or already imported. .It Fl l Indicates that this command will request encryption keys for all encrypted datasets it attempts to mount as it is bringing the pool online. Note that if any datasets have a .Sy keylocation of .Sy prompt this command will block waiting for the keys to be entered. Without this flag encrypted datasets will be left unavailable until the keys are loaded. .It Fl m Allows a pool to import when there is a missing log device. Recent transactions can be lost because the log device will be discarded. .It Fl n Used with the .Fl F recovery option. 
Determines whether a non-importable pool can be made importable again, but does not actually perform the pool recovery. For more details about pool recovery mode, see the .Fl F option, above. .It Fl o Ar mntopts Comma-separated list of mount options to use when mounting datasets within the pool. See .Xr zfs 8 for a description of dataset properties and mount options. .It Fl o Ar property Ns = Ns Ar value Sets the specified property on the imported pool. See the .Xr zpoolprops 7 manual page for more information on the available pool properties. .It Fl R Ar root Sets the .Sy cachefile property to .Sy none and the .Sy altroot property to .Ar root . .It Fl s Scan using the default search path, the libblkid cache will not be consulted. A custom search path may be specified by setting the .Sy ZPOOL_IMPORT_PATH environment variable. .It Fl X Used with the .Fl F recovery option. Determines whether extreme measures to find a valid txg should take place. This allows the pool to be rolled back to a txg which is no longer guaranteed to be consistent. Pools imported at an inconsistent txg may contain uncorrectable checksum errors. For more details about pool recovery mode, see the .Fl F option, above. WARNING: This option can be extremely hazardous to the health of your pool and should only be used as a last resort. .It Fl T Specify the txg to use for rollback. Implies .Fl FX . For more details about pool recovery mode, see the .Fl X option, above. .Em WARNING : This option can be extremely hazardous to the health of your pool and should only be used as a last resort. .It Fl t Used with .Ar newpool . Specifies that .Ar newpool is temporary. Temporary pool names last until export. Ensures that the original pool name will be used in all label updates and therefore is retained upon export. Will also set .Fl o Sy cachefile Ns = Ns Sy none when not explicitly specified. .El .El . .Sh EXAMPLES .\" These are, respectively, examples 9 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 9 : No Importing a ZFS Storage Pool The following command displays available pools, and then imports the pool .Ar tank for use on the system. The results from this command are similar to the following: .Bd -literal -compact -offset Ds .No # Nm zpool Cm import pool: tank id: 15451357997522795478 state: ONLINE action: The pool can be imported using its name or numeric identifier. config: tank ONLINE mirror ONLINE sda ONLINE sdb ONLINE .No # Nm zpool Cm import Ar tank .Ed . .Sh SEE ALSO .Xr zpool-export 8 , .Xr zpool-list 8 , .Xr zpool-status 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-initialize.8 b/sys/contrib/openzfs/man/man8/zpool-initialize.8 index 39579a58010e..5299a897cb97 100644 --- a/sys/contrib/openzfs/man/man8/zpool-initialize.8 +++ b/sys/contrib/openzfs/man/man8/zpool-initialize.8 @@ -1,87 +1,87 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
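A few hypothetical zpool import invocations tying together the options described above; the pool name, device directory, and altroot path are placeholders.

# List pools that are available for import:
zpool import

# Import, searching only a specific device directory:
zpool import -d /dev/disk/by-id tank

# Check whether a damaged pool could be recovered, without modifying it:
zpool import -F -n tank

# Import under an alternate root without mounting any datasets:
zpool import -N -R /mnt/rescue tank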
.\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP. .\" -.Dd May 27, 2021 +.Dd July 30, 2025 .Dt ZPOOL-INITIALIZE 8 .Os . .Sh NAME .Nm zpool-initialize .Nd write to unallocated regions of ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm initialize .Op Fl c Ns | Ns Fl s | Ns Fl u .Op Fl w .Fl a Ns | Ns Ar pool .Oo Ar device Oc Ns … . .Sh DESCRIPTION Begins initializing by writing to all unallocated regions on the specified devices, or all eligible devices in the pool if no individual devices are specified. Only leaf data or log devices may be initialized. .Bl -tag -width Ds .It Fl a , -all Begin, cancel, suspend initializing on all pools. .It Fl c , -cancel Cancel initializing on the specified devices, or all eligible devices if none are specified. If one or more target devices are invalid or are not currently being initialized, the command will fail and no cancellation will occur on any device. .It Fl s , -suspend Suspend initializing on the specified devices, or all eligible devices if none are specified. If one or more target devices are invalid or are not currently being initialized, the command will fail and no suspension will occur on any device. Initializing can then be resumed by running .Nm zpool Cm initialize with no flags on the relevant target devices. .It Fl u , -uninit Clears the initialization state on the specified devices, or all eligible devices if none are specified. If the devices are being actively initialized the command will fail. After being cleared .Nm zpool Cm initialize with no flags can be used to re-initialize all unallocated regions on the relevant target devices. .It Fl w , -wait Wait until the devices have finished initializing before returning. .El . .Sh SEE ALSO .Xr zpool-add 8 , .Xr zpool-attach 8 , .Xr zpool-create 8 , .Xr zpool-online 8 , .Xr zpool-replace 8 , .Xr zpool-trim 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-iostat.8 b/sys/contrib/openzfs/man/man8/zpool-iostat.8 index d8c21d0cfc6c..5dd9c9d55e20 100644 --- a/sys/contrib/openzfs/man/man8/zpool-iostat.8 +++ b/sys/contrib/openzfs/man/man8/zpool-iostat.8 @@ -1,307 +1,307 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
.\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd January 29, 2024 .Dt ZPOOL-IOSTAT 8 .Os . .Sh NAME .Nm zpool-iostat .Nd display logical I/O statistics for ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm iostat .Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw .Op Fl T Sy u Ns | Ns Sy d .Op Fl ghHLnpPvy .Oo Ar pool Ns … Ns | Ns Oo Ar pool vdev Ns … Oc Ns | Ns Ar vdev Ns … Oc .Op Ar interval Op Ar count . .Sh DESCRIPTION Displays logical I/O statistics for the given pools/vdevs. Physical I/O statistics may be observed via .Xr iostat 1 . If writes are located nearby, they may be merged into a single larger operation. Additional I/O may be generated depending on the level of vdev redundancy. To filter output, you may pass in a list of pools, a pool and list of vdevs in that pool, or a list of any vdevs from any pool. If no items are specified, statistics for every pool in the system are shown. When given an .Ar interval , the statistics are printed every .Ar interval seconds until killed. If .Fl n flag is specified the headers are displayed only once, otherwise they are displayed periodically. If .Ar count is specified, the command exits after .Ar count reports are printed. The first report printed is always the statistics since boot regardless of whether .Ar interval and .Ar count are passed. However, this behavior can be suppressed with the .Fl y flag. Also note that the units of .Sy K , .Sy M , .Sy G Ns … that are printed in the report are in base 1024. To get the raw values, use the .Fl p flag. .Bl -tag -width Ds .It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … Run a script (or scripts) on each vdev and include the output as a new column in the .Nm zpool Cm iostat output. Users can run any script found in their .Pa ~/.zpool.d directory or from the system .Pa /etc/zfs/zpool.d directory. Script names containing the slash .Pq Sy / character are not allowed. The default search path can be overridden by setting the .Sy ZPOOL_SCRIPTS_PATH environment variable. A privileged user can only run .Fl c if they have the .Sy ZPOOL_SCRIPTS_AS_ROOT environment variable set. If a script requires the use of a privileged command, like .Xr smartctl 8 , then it's recommended you allow the user access to it in .Pa /etc/sudoers or add the user to the .Pa /etc/sudoers.d/zfs file. .Pp If .Fl c is passed without a script name, it prints a list of all scripts. .Fl c also sets verbose mode .No \&( Ns Fl v Ns No \&) . .Pp Script output should be in the form of "name=value". The column name is set to "name" and the value is set to "value". Multiple lines can be used to output multiple columns. The first line of output not in the "name=value" format is displayed without a column title, and no more output after that is displayed. This can be useful for printing error messages. Blank or NULL values are printed as a '-' to make output AWKable. 
.Pp The following environment variables are set before running each script: .Bl -tag -compact -width "VDEV_ENC_SYSFS_PATH" .It Sy VDEV_PATH Full path to the vdev .It Sy VDEV_UPATH Underlying path to the vdev .Pq Pa /dev/sd* . For use with device mapper, multipath, or partitioned vdevs. .It Sy VDEV_ENC_SYSFS_PATH The sysfs path to the enclosure for the vdev (if any). .El .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Sy u for a printed representation of the internal representation of time. See .Xr time 1 . Specify .Sy d for standard date format. See .Xr date 1 . .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the .Pa /dev/disk/ path used to open it. .It Fl n Print headers only once when passed .It Fl p Display numbers in parsable (exact) values. Time values are in nanoseconds. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl r Print request size histograms for the leaf vdev's I/O. This includes histograms of individual I/O (ind) and aggregate I/O (agg). These stats can be useful for observing how well I/O aggregation is working. Note that TRIM I/O may exceed 16M, but will be counted as 16M. .It Fl v Verbose statistics Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. .It Fl y Normally the first line of output reports the statistics since boot: suppress it. .It Fl w Display latency histograms: .Bl -tag -compact -width "asyncq_read/write" .It Sy total_wait Total I/O time (queuing + disk I/O time). .It Sy disk_wait Disk I/O time (time reading/writing the disk). .It Sy syncq_wait Amount of time I/O spent in synchronous priority queues. Does not include disk time. .It Sy asyncq_wait Amount of time I/O spent in asynchronous priority queues. Does not include disk time. .It Sy scrub Amount of time I/O spent in scrub queue. Does not include disk time. .It Sy rebuild Amount of time I/O spent in rebuild queue. Does not include disk time. .El .It Fl l Include average latency statistics: .Bl -tag -compact -width "asyncq_read/write" .It Sy total_wait Average total I/O time (queuing + disk I/O time). .It Sy disk_wait Average disk I/O time (time reading/writing the disk). .It Sy syncq_wait Average amount of time I/O spent in synchronous priority queues. Does not include disk time. .It Sy asyncq_wait Average amount of time I/O spent in asynchronous priority queues. Does not include disk time. .It Sy scrub Average queuing time in scrub queue. Does not include disk time. .It Sy trim Average queuing time in trim queue. Does not include disk time. .It Sy rebuild Average queuing time in rebuild queue. Does not include disk time. .El .It Fl q Include active queue statistics. Each priority queue has both pending .Sy ( pend ) and active .Sy ( activ ) I/O requests. Pending requests are waiting to be issued to the disk, and active requests have been issued to disk and are waiting for completion. These stats are broken out by priority queue: .Bl -tag -compact -width "asyncq_read/write" .It Sy syncq_read/write Current number of entries in synchronous priority queues. 
.It Sy asyncq_read/write Current number of entries in asynchronous priority queues. .It Sy scrubq_read Current number of entries in scrub queue. .It Sy trimq_write Current number of entries in trim queue. .It Sy rebuildq_write Current number of entries in rebuild queue. .El .Pp All queue statistics are instantaneous measurements of the number of entries in the queues. If you specify an interval, the measurements will be sampled from the end of the interval. .El . .Sh EXAMPLES .\" These are, respectively, examples 13, 16 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 13 : No Adding Cache Devices to a ZFS Pool The following command adds two disks for use as cache devices to a ZFS storage pool: .Dl # Nm zpool Cm add Ar pool Sy cache Pa sdc sdd .Pp Once added, the cache devices gradually fill with content from main memory. Depending on the size of your cache devices, it could take over an hour for them to fill. Capacity and reads can be monitored using the .Cm iostat subcommand as follows: .Dl # Nm zpool Cm iostat Fl v Ar pool 5 . .Ss Example 16 : No Adding output columns Additional columns can be added to the .Nm zpool Cm status No and Nm zpool Cm iostat No output with Fl c . .Bd -literal -compact -offset Ds .No # Nm zpool Cm status Fl c Pa vendor , Ns Pa model , Ns Pa size NAME STATE READ WRITE CKSUM vendor model size tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 U1 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U10 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U11 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U12 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U13 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U14 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T .No # Nm zpool Cm iostat Fl vc Pa size capacity operations bandwidth pool alloc free read write read write size ---------- ----- ----- ----- ----- ----- ----- ---- rpool 14.6G 54.9G 4 55 250K 2.69M sda1 14.6G 54.9G 4 55 250K 2.69M 70G ---------- ----- ----- ----- ----- ----- ----- ---- .Ed . .Sh SEE ALSO .Xr iostat 1 , .Xr smartctl 8 , .Xr zpool-list 8 , .Xr zpool-status 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-labelclear.8 b/sys/contrib/openzfs/man/man8/zpool-labelclear.8 index ba3d1509aa75..b807acaaede3 100644 --- a/sys/contrib/openzfs/man/man8/zpool-labelclear.8 +++ b/sys/contrib/openzfs/man/man8/zpool-labelclear.8 @@ -1,62 +1,62 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. 
All Rights Reserved. .\" -.Dd May 31, 2021 +.Dd July 11, 2022 .Dt ZPOOL-LABELCLEAR 8 .Os . .Sh NAME .Nm zpool-labelclear .Nd remove ZFS label information from device .Sh SYNOPSIS .Nm zpool .Cm labelclear .Op Fl f .Ar device . .Sh DESCRIPTION Removes ZFS label information from the specified .Ar device . If the .Ar device is a cache device, it also removes the L2ARC header (persistent L2ARC). The .Ar device must not be part of an active pool configuration. .Bl -tag -width Ds .It Fl f Treat exported or foreign devices as inactive. .El . .Sh SEE ALSO .Xr zpool-destroy 8 , .Xr zpool-detach 8 , .Xr zpool-remove 8 , .Xr zpool-replace 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-list.8 b/sys/contrib/openzfs/man/man8/zpool-list.8 index b720e203c1c9..106399941f98 100644 --- a/sys/contrib/openzfs/man/man8/zpool-list.8 +++ b/sys/contrib/openzfs/man/man8/zpool-list.8 @@ -1,254 +1,254 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd October 12, 2024 .Dt ZPOOL-LIST 8 .Os . .Sh NAME .Nm zpool-list .Nd list information about ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm list .Op Fl HgLpPv .Op Fl j Op Ar --json-int, --json-pool-key-guid .Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns … .Op Fl T Sy u Ns | Ns Sy d .Oo Ar pool Oc Ns … .Op Ar interval Op Ar count . .Sh DESCRIPTION Lists the given pools along with a health status and space usage. If no .Ar pool Ns s are specified, all pools in the system are listed. When given an .Ar interval , the information is printed every .Ar interval seconds until killed. If .Ar count is specified, the command exits after .Ar count reports are printed. .Bl -tag -width Ds .It Fl j , -json Op Ar --json-int, --json-pool-key-guid Display the list of pools in JSON format. Specify .Sy --json-int to display the numbers in integer format instead of strings. Specify .Sy --json-pool-key-guid to set pool GUID as key for pool objects instead of pool names. .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. .It Fl o Ar property Comma-separated list of properties to display. 
See the .Xr zpoolprops 7 manual page for a list of valid properties. The default list is .Sy name , size , allocated , free , checkpoint, expandsize , fragmentation , .Sy capacity , dedupratio , health , altroot . .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the .Pa /dev/disk path used to open it. .It Fl p Display numbers in parsable .Pq exact values. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Sy u for a printed representation of the internal representation of time. See .Xr time 1 . Specify .Sy d for standard date format. See .Xr date 1 . .It Fl v Verbose statistics. Reports usage statistics for individual vdevs within the pool, in addition to the pool-wide statistics. .El . .Sh EXAMPLES .\" These are, respectively, examples 6, 15 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Listing Available ZFS Storage Pools The following command lists all available pools on the system. In this case, the pool .Ar zion is faulted due to a missing device. The results from this command are similar to the following: .Bd -literal -compact -offset Ds .No # Nm zpool Cm list NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT rpool 19.9G 8.43G 11.4G - 33% 42% 1.00x ONLINE - tank 61.5G 20.0G 41.5G - 48% 32% 1.00x ONLINE - zion - - - - - - - FAULTED - .Ed . .Ss Example 2 : No Displaying expanded space on a device The following command displays the detailed information for the pool .Ar data . This pool is comprised of a single raidz vdev where one of its devices increased its capacity by 10 GiB. In this example, the pool will not be able to utilize this extra capacity until all the devices under the raidz vdev have been expanded. .Bd -literal -compact -offset Ds .No # Nm zpool Cm list Fl v Ar data NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - raidz1 23.9G 14.6G 9.30G - 48% sda - - - - - sdb - - - 10G - sdc - - - - - .Ed . .Ss Example 3 : No Displaying expanded space on a device The following command lists all available pools on the system in JSON format. .Bd -literal -compact -offset Ds .No # Nm zpool Cm list Fl j | Nm jq { "output_version": { "command": "zpool list", "vers_major": 0, "vers_minor": 1 }, "pools": { "tank": { "name": "tank", "type": "POOL", "state": "ONLINE", "guid": "15220353080205405147", "txg": "2671", "spa_version": "5000", "zpl_version": "5", "properties": { "size": { "value": "111G", "source": { "type": "NONE", "data": "-" } }, "allocated": { "value": "30.8G", "source": { "type": "NONE", "data": "-" } }, "free": { "value": "80.2G", "source": { "type": "NONE", "data": "-" } }, "checkpoint": { "value": "-", "source": { "type": "NONE", "data": "-" } }, "expandsize": { "value": "-", "source": { "type": "NONE", "data": "-" } }, "fragmentation": { "value": "0%", "source": { "type": "NONE", "data": "-" } }, "capacity": { "value": "27%", "source": { "type": "NONE", "data": "-" } }, "dedupratio": { "value": "1.00x", "source": { "type": "NONE", "data": "-" } }, "health": { "value": "ONLINE", "source": { "type": "NONE", "data": "-" } }, "altroot": { "value": "-", "source": { "type": "DEFAULT", "data": "-" } } } } } } .Ed . 
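As one further illustration (not part of the upstream page), the -H and -p flags combine for script-friendly output: no headers, tab-separated fields, and exact byte counts. The pool name and the values shown are hypothetical:
# zpool list -Hp -o name,size,allocated,health tank
tank    11995116277760    33285996544    ONLINE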
.Sh SEE ALSO .Xr zpool-import 8 , .Xr zpool-status 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-offline.8 b/sys/contrib/openzfs/man/man8/zpool-offline.8 index 49b1f34ad5d5..388c7634acce 100644 --- a/sys/contrib/openzfs/man/man8/zpool-offline.8 +++ b/sys/contrib/openzfs/man/man8/zpool-offline.8 @@ -1,107 +1,107 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd December 21, 2023 .Dt ZPOOL-OFFLINE 8 .Os . .Sh NAME .Nm zpool-offline .Nd take physical devices offline in ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm offline .Op Fl Sy -power Ns | Ns Op Fl Sy ft .Ar pool .Ar device Ns … .Nm zpool .Cm online .Op Fl Sy -power .Op Fl Sy e .Ar pool .Ar device Ns … . .Sh DESCRIPTION .Bl -tag -width Ds .It Xo .Nm zpool .Cm offline .Op Fl Sy -power Ns | Ns Op Fl Sy ft .Ar pool .Ar device Ns … .Xc Takes the specified physical device offline. While the .Ar device is offline, no attempt is made to read or write to the device. This command is not applicable to spares. .Bl -tag -width Ds .It Fl -power Power off the device's slot in the storage enclosure. This flag currently works on Linux only .It Fl f Force fault. Instead of offlining the disk, put it into a faulted state. The fault will persist across imports unless the .Fl t flag was specified. .It Fl t Temporary. Upon reboot, the specified physical device reverts to its previous state. .El .It Xo .Nm zpool .Cm online .Op Fl -power .Op Fl e .Ar pool .Ar device Ns … .Xc Brings the specified physical device online. This command is not applicable to spares. .Bl -tag -width Ds .It Fl -power Power on the device's slot in the storage enclosure and wait for the device to show up before attempting to online it. Alternatively, you can set the .Sy ZPOOL_AUTO_POWER_ON_SLOT environment variable to always enable this behavior. This flag currently works on Linux only .It Fl e Expand the device to use all available space. If the device is part of a mirror or raidz then all devices must be expanded before the new space will become available to the pool. .El .El . 
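For illustration only (this example is not part of the upstream page), a common maintenance sequence takes a disk offline temporarily and later brings it back online, expanding it in case a larger disk was installed; the pool and device names are hypothetical:
# zpool offline -t tank sdb
# zpool online -e tank sdb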
.Sh SEE ALSO .Xr zpool-detach 8 , .Xr zpool-remove 8 , .Xr zpool-reopen 8 , .Xr zpool-resilver 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-reguid.8 b/sys/contrib/openzfs/man/man8/zpool-reguid.8 index 77101fc07326..b98c88e320de 100644 --- a/sys/contrib/openzfs/man/man8/zpool-reguid.8 +++ b/sys/contrib/openzfs/man/man8/zpool-reguid.8 @@ -1,61 +1,61 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2024, Klara Inc. .\" Copyright (c) 2024, Mateusz Piotrowski .\" -.Dd June 21, 2023 +.Dd August 26, 2024 .Dt ZPOOL-REGUID 8 .Os . .Sh NAME .Nm zpool-reguid .Nd generate new unique identifier for ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm reguid .Op Fl g Ar guid .Ar pool . .Sh DESCRIPTION Generates a new unique identifier for the pool. You must ensure that all devices in this pool are online and healthy before performing this action. . .Bl -tag -width Ds .It Fl g Ar guid Set the pool GUID to the provided value. The GUID can be any 64-bit value accepted by .Xr strtoull 3 in base 10. .Nm will return an error if the provided GUID is already in use. .El .Sh SEE ALSO .Xr zpool-export 8 , .Xr zpool-import 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-remove.8 b/sys/contrib/openzfs/man/man8/zpool-remove.8 index d10a92e49bbe..4d5fc431d332 100644 --- a/sys/contrib/openzfs/man/man8/zpool-remove.8 +++ b/sys/contrib/openzfs/man/man8/zpool-remove.8 @@ -1,190 +1,190 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. 
.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd March 16, 2022 +.Dd November 19, 2024 .Dt ZPOOL-REMOVE 8 .Os . .Sh NAME .Nm zpool-remove .Nd remove devices from ZFS storage pool . .Sh SYNOPSIS .Nm zpool .Cm remove .Op Fl npw .Ar pool Ar device Ns … .Nm zpool .Cm remove .Fl s .Ar pool . .Sh DESCRIPTION .Bl -tag -width Ds .It Xo .Nm zpool .Cm remove .Op Fl npw .Ar pool Ar device Ns … .Xc Removes the specified device from the pool. This command supports removing hot spare, cache, log, and both mirrored and non-redundant primary top-level vdevs, including dedup and special vdevs. .Pp Top-level vdevs can only be removed if the primary pool storage does not contain a top-level raidz vdev, all top-level vdevs have the same sector size, and the keys for all encrypted datasets are loaded. .Pp Removing a top-level vdev reduces the total amount of space in the storage pool. The specified device will be evacuated by copying all allocated space from it to the other devices in the pool. In this case, the .Nm zpool Cm remove command initiates the removal and returns, while the evacuation continues in the background. The removal progress can be monitored with .Nm zpool Cm status . If an I/O error is encountered during the removal process it will be canceled. The .Sy device_removal feature flag must be enabled to remove a top-level vdev, see .Xr zpool-features 7 . .Pp A mirrored top-level device (log or data) can be removed by specifying the top- level mirror for the same. Non-log devices or data devices that are part of a mirrored configuration can be removed using the .Nm zpool Cm detach command. .Bl -tag -width Ds .It Fl n Do not actually perform the removal .Pq Qq No-op . Instead, print the estimated amount of memory that will be used by the mapping table after the removal completes. This is nonzero only for top-level vdevs. .El .Bl -tag -width Ds .It Fl p Used in conjunction with the .Fl n flag, displays numbers as parsable (exact) values. .It Fl w Waits until the removal has completed before returning. .El .It Xo .Nm zpool .Cm remove .Fl s .Ar pool .Xc Stops and cancels an in-progress removal of a top-level vdev. .El . .Sh EXAMPLES .\" These are, respectively, examples 15 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device .Sy mirror-1 . 
.Pp Given this configuration: .Bd -literal -compact -offset Ds pool: tank state: ONLINE scrub: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 sda ONLINE 0 0 0 sdb ONLINE 0 0 0 mirror-1 ONLINE 0 0 0 sdc ONLINE 0 0 0 sdd ONLINE 0 0 0 logs mirror-2 ONLINE 0 0 0 sde ONLINE 0 0 0 sdf ONLINE 0 0 0 .Ed .Pp The command to remove the mirrored log .Ar mirror-2 No is : .Dl # Nm zpool Cm remove Ar tank mirror-2 .Pp At this point, the log device no longer exists (both sides of the mirror have been removed): .Bd -literal -compact -offset Ds pool: tank state: ONLINE scan: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 sda ONLINE 0 0 0 sdb ONLINE 0 0 0 mirror-1 ONLINE 0 0 0 sdc ONLINE 0 0 0 sdd ONLINE 0 0 0 .Ed .Pp The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 .Pp After .Ar mirror-1 No has been evacuated, the pool remains redundant, but the total amount of space is reduced: .Bd -literal -compact -offset Ds pool: tank state: ONLINE scan: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 sda ONLINE 0 0 0 sdb ONLINE 0 0 0 .Ed . .Sh SEE ALSO .Xr zpool-add 8 , .Xr zpool-detach 8 , .Xr zpool-labelclear 8 , .Xr zpool-offline 8 , .Xr zpool-replace 8 , .Xr zpool-split 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-reopen.8 b/sys/contrib/openzfs/man/man8/zpool-reopen.8 index 594cff3d16d8..c4e10f0a546e 100644 --- a/sys/contrib/openzfs/man/man8/zpool-reopen.8 +++ b/sys/contrib/openzfs/man/man8/zpool-reopen.8 @@ -1,53 +1,53 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd June 2, 2021 +.Dd July 11, 2022 .Dt ZPOOL-REOPEN 8 .Os . .Sh NAME .Nm zpool-reopen .Nd reopen vdevs associated with ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm reopen .Op Fl n .Oo Ar pool Oc Ns … . .Sh DESCRIPTION Reopen all vdevs associated with the specified pools, or all pools if none specified. . .Sh OPTIONS .Bl -tag -width "-n" .It Fl n Do not restart an in-progress scrub operation. This is not recommended and can result in partially resilvered devices unless a second scrub is performed. 
.El diff --git a/sys/contrib/openzfs/man/man8/zpool-replace.8 b/sys/contrib/openzfs/man/man8/zpool-replace.8 index 9f3156eeb3ef..651af13b19b8 100644 --- a/sys/contrib/openzfs/man/man8/zpool-replace.8 +++ b/sys/contrib/openzfs/man/man8/zpool-replace.8 @@ -1,100 +1,100 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 29, 2021 +.Dd July 11, 2022 .Dt ZPOOL-REPLACE 8 .Os . .Sh NAME .Nm zpool-replace .Nd replace one device with another in ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm replace .Op Fl fsw .Oo Fl o Ar property Ns = Ns Ar value Oc .Ar pool Ar device Op Ar new-device . .Sh DESCRIPTION Replaces .Ar device with .Ar new-device . This is equivalent to attaching .Ar new-device , waiting for it to resilver, and then detaching .Ar device . Any in progress scrub will be canceled. .Pp The size of .Ar new-device must be greater than or equal to the minimum size of all the devices in a mirror or raidz configuration. .Pp .Ar new-device is required if the pool is not redundant. If .Ar new-device is not specified, it defaults to .Ar device . This form of replacement is useful after an existing disk has failed and has been physically replaced. In this case, the new disk may have the same .Pa /dev path as the old device, even though it is actually a different disk. ZFS recognizes this. .Bl -tag -width Ds .It Fl f Forces use of .Ar new-device , even if it appears to be in use. Not all devices can be overridden in this manner. .It Fl o Ar property Ns = Ns Ar value Sets the given pool properties. See the .Xr zpoolprops 7 manual page for a list of valid properties that can be set. The only property supported at the moment is .Sy ashift . .It Fl s The .Ar new-device is reconstructed sequentially to restore redundancy as quickly as possible. Checksums are not verified during sequential reconstruction so a scrub is started when the resilver completes. Sequential reconstruction is not supported for raidz configurations. .It Fl w Waits until the replacement has completed before returning. .El . 
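For illustration only (not part of the upstream page), the two common forms of replacement; the pool and device names are hypothetical:
# Same /dev path after a physical swap of the failed disk:
# zpool replace tank sdb
# Replace with a different disk and wait for the resilver to complete:
# zpool replace -w tank sdb sdf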
.Sh SEE ALSO .Xr zpool-detach 8 , .Xr zpool-initialize 8 , .Xr zpool-online 8 , .Xr zpool-resilver 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-resilver.8 b/sys/contrib/openzfs/man/man8/zpool-resilver.8 index 2161d77f62ed..59c4be5db209 100644 --- a/sys/contrib/openzfs/man/man8/zpool-resilver.8 +++ b/sys/contrib/openzfs/man/man8/zpool-resilver.8 @@ -1,58 +1,58 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd July 11, 2022 .Dt ZPOOL-RESILVER 8 .Os . .Sh NAME .Nm zpool-resilver .Nd resilver devices in ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm resilver .Ar pool Ns … . .Sh DESCRIPTION Starts a resilver of the specified pools. If an existing resilver is already running it will be restarted from the beginning. Any drives that were scheduled for a deferred resilver will be added to the new one. This requires the .Sy resilver_defer pool feature. . .Sh SEE ALSO .Xr zpool-iostat 8 , .Xr zpool-online 8 , .Xr zpool-reopen 8 , .Xr zpool-replace 8 , .Xr zpool-scrub 8 , .Xr zpool-status 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-scrub.8 b/sys/contrib/openzfs/man/man8/zpool-scrub.8 index 0ecf8bd3851f..cf7ead5788bf 100644 --- a/sys/contrib/openzfs/man/man8/zpool-scrub.8 +++ b/sys/contrib/openzfs/man/man8/zpool-scrub.8 @@ -1,210 +1,210 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. 
All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018, 2021 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP. .\" -.Dd December 11, 2024 +.Dd August 6, 2025 .Dt ZPOOL-SCRUB 8 .Os . .Sh NAME .Nm zpool-scrub .Nd begin or resume scrub of ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm scrub .Op Ns Fl e | Ns Fl p | Fl s Ns | Fl C Ns .Op Fl w .Op Fl S Ar date .Op Fl E Ar date .Fl a Ns | Ns Ar pool Ns … . .Sh DESCRIPTION Begins a scrub or resumes a paused scrub. The scrub examines all data in the specified pools to verify that it checksums correctly. For replicated .Pq mirror, raidz, or draid devices, ZFS automatically repairs any damage discovered during the scrub. The .Nm zpool Cm status command reports the progress of the scrub and summarizes the results of the scrub upon completion. .Pp Scrubbing and resilvering are very similar operations. The difference is that resilvering only examines data that ZFS knows to be out of date .Po for example, when attaching a new device to a mirror or replacing an existing device .Pc , whereas scrubbing examines all data to discover silent errors due to hardware faults or disk failure. .Pp When scrubbing a pool with encrypted filesystems the keys do not need to be loaded. However, if the keys are not loaded and an unrepairable checksum error is detected the file name cannot be included in the .Nm zpool Cm status Fl v verbose error report. .Pp Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows one at a time. .Pp A scrub is split into two parts: metadata scanning and block scrubbing. The metadata scanning sorts blocks into large sequential ranges which can then be read much more efficiently from disk when issuing the scrub I/O. .Pp If a scrub is paused, the .Nm zpool Cm scrub resumes it. If a resilver is in progress, ZFS does not allow a scrub to be started until the resilver completes. .Pp Note that, due to changes in pool data on a live system, it is possible for scrubs to progress slightly beyond 100% completion. During this period, no completion time estimate will be provided. . .Sh OPTIONS .Bl -tag -width "-s" .It Fl a , -all Begin, pause, stop scrub on all pools. Initiating scrubs on multiple pools can put considerable load and memory pressure on the system, so this operation should be performed with caution. .It Fl s Stop scrubbing. .It Fl p Pause scrubbing. Scrub pause state and progress are periodically synced to disk. If the system is restarted or pool is exported during a paused scrub, even after import, scrub will remain paused until it is resumed. Once resumed the scrub will pick up from the place where it was last checkpointed to disk. To resume a paused scrub issue .Nm zpool Cm scrub or .Nm zpool Cm scrub .Fl e again. .It Fl w Wait until scrub has completed before returning. .It Fl e Only scrub files with known data errors as reported by .Nm zpool Cm status Fl v . The pool must have been scrubbed at least once with the .Sy head_errlog feature enabled to use this option. Error scrubbing cannot be run simultaneously with regular scrubbing or resilvering, nor can it be run when a regular scrub is paused. .It Fl C Continue scrub from last saved txg (see zpool .Sy last_scrubbed_txg property). .It Fl S Ar date , Fl E Ar date Allows specifying the date range for blocks created between these dates. 
.Bl -bullet -compact -offset indent .It .Fl S Defines a start date. If not specified, scrubbing begins from the start of the pool's existence. .It .Fl E Defines an end date. If not specified, scrubbing continues up to the most recent data. .El The provided date should be in the format: .Dq YYYY-MM-DD HH:MM . Where: .Bl -bullet -compact -offset indent .It .Dq YYYY is the year. .It .Dq MM is the numeric representation of the month. .It .Dq DD is the day of the month. .It .Dq HH is the hour. .It .Dq MM is the minutes. .El The hour and minutes parameters can be omitted. The time should be provided in the machine's local time zone. Specifying dates prior to enabling this feature will result in scrubbing starting from the date the pool was created. If the system time was moved backward manually, the data range may become inaccurate. .El .Sh EXAMPLES .Ss Example 1 Status of pool with ongoing scrub: .sp .Bd -literal -compact .No # Nm zpool Cm status ... scan: scrub in progress since Sun Jul 25 16:07:49 2021 403M / 405M scanned at 100M/s, 68.4M / 405M issued at 10.0M/s 0B repaired, 16.91% done, 00:00:04 to go ... .Ed .Pp Where metadata which references 403M of file data has been scanned at 100M/s, and 68.4M of that file data has been scrubbed sequentially at 10.0M/s. .Sh PERIODIC SCRUB On machines using systemd, scrub timers can be enabled on a per-pool basis. .Nm weekly and .Nm monthly timer units are provided. .Bl -tag -width Ds .It Xo .Xc .Nm systemctl .Cm enable .Cm zfs-scrub-\fIweekly\fB@\fIrpool\fB.timer .Cm --now .It Xo .Xc .Nm systemctl .Cm enable .Cm zfs-scrub-\fImonthly\fB@\fIotherpool\fB.timer .Cm --now .El . .Sh SEE ALSO .Xr systemd.timer 5 , .Xr zpool-iostat 8 , .Xr zpool-resilver 8 , .Xr zpool-status 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-split.8 b/sys/contrib/openzfs/man/man8/zpool-split.8 index a67c865cf30c..ee4c6384cf23 100644 --- a/sys/contrib/openzfs/man/man8/zpool-split.8 +++ b/sys/contrib/openzfs/man/man8/zpool-split.8 @@ -1,118 +1,118 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd June 2, 2021 +.Dd July 11, 2022 .Dt ZPOOL-SPLIT 8 .Os . .Sh NAME .Nm zpool-split .Nd split devices off ZFS storage pool, creating new pool .Sh SYNOPSIS .Nm zpool .Cm split .Op Fl gLlnP .Oo Fl o Ar property Ns = Ns Ar value Oc Ns … .Op Fl R Ar root .Ar pool newpool .Oo Ar device Oc Ns … .
.Sh DESCRIPTION Splits devices off .Ar pool creating .Ar newpool . All vdevs in .Ar pool must be mirrors and the pool must not be in the process of resilvering. At the time of the split, .Ar newpool will be a replica of .Ar pool . By default, the last device in each mirror is split from .Ar pool to create .Ar newpool . .Pp The optional device specification causes the specified device(s) to be included in the new .Ar pool and, should any devices remain unspecified, the last device in each mirror is used as would be by default. .Bl -tag -width Ds .It Fl g Display vdev GUIDs instead of the normal device names. These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the .Pa /dev/disk/ path used to open it. .It Fl l Indicates that this command will request encryption keys for all encrypted datasets it attempts to mount as it is bringing the new pool online. Note that if any datasets have .Sy keylocation Ns = Ns Sy prompt , this command will block waiting for the keys to be entered. Without this flag, encrypted datasets will be left unavailable until the keys are loaded. .It Fl n Do a dry-run .Pq Qq No-op split: do not actually perform it. Print out the expected configuration of .Ar newpool . .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl o Ar property Ns = Ns Ar value Sets the specified property for .Ar newpool . See the .Xr zpoolprops 7 manual page for more information on the available pool properties. .It Fl R Ar root Set .Sy altroot for .Ar newpool to .Ar root and automatically import it. .El . .Sh SEE ALSO .Xr zpool-import 8 , .Xr zpool-list 8 , .Xr zpool-remove 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-status.8 b/sys/contrib/openzfs/man/man8/zpool-status.8 index a7f3e088043b..108a1067b384 100644 --- a/sys/contrib/openzfs/man/man8/zpool-status.8 +++ b/sys/contrib/openzfs/man/man8/zpool-status.8 @@ -1,372 +1,372 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd February 14, 2024 +.Dd May 20, 2025 .Dt ZPOOL-STATUS 8 .Os . 
.Sh NAME .Nm zpool-status .Nd show detailed health status for ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm status .Op Fl DdegiLPpstvx .Op Fl c Ar script1 Ns Oo , Ns Ar script2 Ns ,… Oc .Oo Fl j|--json .Oo Ns Fl -json-flat-vdevs Oc .Oo Ns Fl -json-int Oc .Oo Ns Fl -json-pool-key-guid Oc .Oc .Op Fl T Ar d|u .Op Fl -power .Op Ar pool .Op Ar interval Op Ar count . .Sh DESCRIPTION Displays the detailed health status for the given pools. If no .Ar pool is specified, then the status of each pool in the system is displayed. For more information on pool and device health, see the .Sx Device Failure and Recovery section of .Xr zpoolconcepts 7 . .Pp If a scrub or resilver is in progress, this command reports the percentage done and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change. .Bl -tag -width Ds .It Fl c Ar script1 Ns Oo , Ns Ar script2 Ns ,… Oc Run a script (or scripts) on each vdev and include the output as a new column in the .Nm zpool Cm status output. See the .Fl c option of .Nm zpool Cm iostat for complete details. .It Fl D Display a histogram of deduplication statistics, showing the allocated .Pq physically present on disk and referenced .Pq logically referenced in the pool block counts and sizes by reference count. If repeated, (-DD), also shows statistics on how much of the DDT is resident in the ARC. .It Fl d Display the number of Direct I/O read/write checksum verify errors that have occurred on a top-level VDEV. See .Sx zfs_vdev_direct_write_verify in .Xr zfs 4 for details about the conditions that can cause Direct I/O write checksum verify failures to occur. Direct I/O reads checksum verify errors can also occur if the contents of the buffer are being manipulated after the I/O has been issued and is in flight. In the case of Direct I/O read checksum verify errors, the I/O will be reissued through the ARC. .It Fl e Only show unhealthy vdevs (not-ONLINE or with errors). .It Fl g Display vdev GUIDs instead of the normal device names These GUIDs can be used in place of device names for the zpool detach/offline/remove/replace commands. .It Fl i Display vdev initialization status. .It Fl j , -json Oo Ns Fl -json-flat-vdevs Oc Oo Ns Fl -json-int Oc \ Oo Ns Fl -json-pool-key-guid Oc Display the status for ZFS pools in JSON format. Specify .Sy --json-flat-vdevs to display vdevs in flat hierarchy instead of nested vdev objects. Specify .Sy --json-int to display numbers in integer format instead of strings. Specify .Sy --json-pool-key-guid to set pool GUID as key for pool objects instead of pool names. .It Fl L Display real paths for vdevs resolving all symbolic links. This can be used to look up the current block device name regardless of the .Pa /dev/disk/ path used to open it. .It Fl P Display full paths for vdevs instead of only the last component of the path. This can be used in conjunction with the .Fl L flag. .It Fl p Display numbers in parsable (exact) values. .It Fl -power Display vdev enclosure slot power status (on or off). .It Fl s Display the number of leaf vdev slow I/O operations. This is the number of I/O operations that didn't complete in .Sy zio_slow_io_ms milliseconds .Pq Sy 30000 No by default . This does not necessarily mean the I/O operations failed to complete, just took an unreasonably long amount of time. This may indicate a problem with the underlying storage. .It Fl T Sy d Ns | Ns Sy u Display a time stamp. Specify .Sy d for standard date format. 
See .Xr date 1 . Specify .Sy u for a printed representation of the internal representation of time. See .Xr time 1 . .It Fl t Display vdev TRIM status. .It Fl v Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub. If the head_errlog feature is enabled and files containing errors have been removed then the respective filenames will not be reported in subsequent runs of this command. .It Fl x Only display status for pools that are exhibiting errors or are otherwise unavailable. Warnings about pools not using the latest on-disk format will not be included. .El . .Sh EXAMPLES .\" These are, respectively, examples 16 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Adding output columns Additional columns can be added to the .Nm zpool Cm status No and Nm zpool Cm iostat No output with Fl c . .Bd -literal -compact -offset Ds .No # Nm zpool Cm status Fl c Pa vendor , Ns Pa model , Ns Pa size NAME STATE READ WRITE CKSUM vendor model size tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 U1 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U10 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U11 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U12 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U13 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U14 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T .No # Nm zpool Cm iostat Fl vc Pa size capacity operations bandwidth pool alloc free read write read write size ---------- ----- ----- ----- ----- ----- ----- ---- rpool 14.6G 54.9G 4 55 250K 2.69M sda1 14.6G 54.9G 4 55 250K 2.69M 70G ---------- ----- ----- ----- ----- ----- ----- ---- .Ed . .Ss Example 2 : No Display the status output in JSON format .Nm zpool Cm status No can output in JSON format if .Fl j is specified. .Fl c can be used to run a script on each VDEV. 
.Bd -literal -compact -offset Ds .No # Nm zpool Cm status Fl j Fl c Pa vendor , Ns Pa model , Ns Pa size | Nm jq { "output_version": { "command": "zpool status", "vers_major": 0, "vers_minor": 1 }, "pools": { "tank": { "name": "tank", "state": "ONLINE", "guid": "3920273586464696295", "txg": "16597", "spa_version": "5000", "zpl_version": "5", "status": "OK", "vdevs": { "tank": { "name": "tank", "alloc_space": "62.6G", "total_space": "15.0T", "def_space": "11.3T", "read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vdevs": { "raidz1-0": { "name": "raidz1-0", "vdev_type": "raidz", "guid": "763132626387621737", "state": "HEALTHY", "alloc_space": "62.5G", "total_space": "10.9T", "def_space": "7.26T", "rep_dev_size": "10.9T", "read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vdevs": { "ca1eb824-c371-491d-ac13-37637e35c683": { "name": "ca1eb824-c371-491d-ac13-37637e35c683", "vdev_type": "disk", "guid": "12841765308123764671", "path": "/dev/disk/by-partuuid/ca1eb824-c371-491d-ac13-37637e35c683", "state": "HEALTHY", "rep_dev_size": "3.64T", "phys_space": "3.64T", "read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vendor": "ATA", "model": "WDC WD40EFZX-68AWUN0", "size": "3.6T" }, "97cd98fb-8fb8-4ac4-bc84-bd8950a7ace7": { "name": "97cd98fb-8fb8-4ac4-bc84-bd8950a7ace7", "vdev_type": "disk", "guid": "1527839927278881561", "path": "/dev/disk/by-partuuid/97cd98fb-8fb8-4ac4-bc84-bd8950a7ace7", "state": "HEALTHY", "rep_dev_size": "3.64T", "phys_space": "3.64T", "read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vendor": "ATA", "model": "WDC WD40EFZX-68AWUN0", "size": "3.6T" }, "e9ddba5f-f948-4734-a472-cb8aa5f0ff65": { "name": "e9ddba5f-f948-4734-a472-cb8aa5f0ff65", "vdev_type": "disk", "guid": "6982750226085199860", "path": "/dev/disk/by-partuuid/e9ddba5f-f948-4734-a472-cb8aa5f0ff65", "state": "HEALTHY", "rep_dev_size": "3.64T", "phys_space": "3.64T", "read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vendor": "ATA", "model": "WDC WD40EFZX-68AWUN0", "size": "3.6T" } } } } } }, "dedup": { "mirror-2": { "name": "mirror-2", "vdev_type": "mirror", "guid": "2227766268377771003", "state": "HEALTHY", "alloc_space": "89.1M", "total_space": "3.62T", "def_space": "3.62T", "rep_dev_size": "3.62T", "read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vdevs": { "db017360-d8e9-4163-961b-144ca75293a3": { "name": "db017360-d8e9-4163-961b-144ca75293a3", "vdev_type": "disk", "guid": "17880913061695450307", "path": "/dev/disk/by-partuuid/db017360-d8e9-4163-961b-144ca75293a3", "state": "HEALTHY", "rep_dev_size": "3.63T", "phys_space": "3.64T", "read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vendor": "ATA", "model": "WDC WD40EFZX-68AWUN0", "size": "3.6T" }, "952c3baf-b08a-4a8c-b7fa-33a07af5fe6f": { "name": "952c3baf-b08a-4a8c-b7fa-33a07af5fe6f", "vdev_type": "disk", "guid": "10276374011610020557", "path": "/dev/disk/by-partuuid/952c3baf-b08a-4a8c-b7fa-33a07af5fe6f", "state": "HEALTHY", "rep_dev_size": "3.63T", "phys_space": "3.64T", "read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vendor": "ATA", "model": "WDC WD40EFZX-68AWUN0", "size": "3.6T" } } } }, "special": { "25d418f8-92bd-4327-b59f-7ef5d5f50d81": { "name": "25d418f8-92bd-4327-b59f-7ef5d5f50d81", "vdev_type": "disk", "guid": "3935742873387713123", "path": "/dev/disk/by-partuuid/25d418f8-92bd-4327-b59f-7ef5d5f50d81", "state": "HEALTHY", "alloc_space": "37.4M", "total_space": "444G", "def_space": "444G", "rep_dev_size": "444G", "phys_space": "447G", 
"read_errors": "0", "write_errors": "0", "checksum_errors": "0", "vendor": "ATA", "model": "Micron_5300_MTFDDAK480TDS", "size": "447.1G" } }, "error_count": "0" } } } .Ed . .Sh SEE ALSO .Xr zpool-events 8 , .Xr zpool-history 8 , .Xr zpool-iostat 8 , .Xr zpool-list 8 , .Xr zpool-resilver 8 , .Xr zpool-scrub 8 , .Xr zpool-wait 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-sync.8 b/sys/contrib/openzfs/man/man8/zpool-sync.8 index 8f438f363e83..d1dc05d0c202 100644 --- a/sys/contrib/openzfs/man/man8/zpool-sync.8 +++ b/sys/contrib/openzfs/man/man8/zpool-sync.8 @@ -1,54 +1,54 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd August 9, 2019 +.Dd July 11, 2022 .Dt ZPOOL-SYNC 8 .Os . .Sh NAME .Nm zpool-sync .Nd flush data to primary storage of ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm sync .Oo Ar pool Oc Ns … . .Sh DESCRIPTION This command forces all in-core dirty data to be written to the primary pool storage and not the ZIL. It will also update administrative information including quota reporting. Without arguments, .Nm zpool Cm sync will sync all pools on the system. Otherwise, it will sync only the specified pools. . .Sh SEE ALSO .Xr zpoolconcepts 7 , .Xr zpool-export 8 , .Xr zpool-iostat 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-trim.8 b/sys/contrib/openzfs/man/man8/zpool-trim.8 index 18723e1be0d2..c4e849019789 100644 --- a/sys/contrib/openzfs/man/man8/zpool-trim.8 +++ b/sys/contrib/openzfs/man/man8/zpool-trim.8 @@ -1,118 +1,118 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
.\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2025 Hewlett Packard Enterprise Development LP. .\" -.Dd May 27, 2021 +.Dd July 30, 2025 .Dt ZPOOL-TRIM 8 .Os . .Sh NAME .Nm zpool-trim .Nd initiate TRIM of free space in ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm trim .Op Fl dw .Op Fl r Ar rate .Op Fl c Ns | Ns Fl s .Fl a Ns | Ns Ar pool .Oo Ar device Ns Oc Ns … . .Sh DESCRIPTION Initiates an immediate on-demand TRIM operation for all of the free space in a pool. This operation informs the underlying storage devices of all blocks in the pool which are no longer allocated and allows thinly provisioned devices to reclaim the space. .Pp A manual on-demand TRIM operation can be initiated irrespective of the .Sy autotrim pool property setting. See the documentation for the .Sy autotrim property above for the types of vdev devices which can be trimmed. .Bl -tag -width Ds .It Fl a , -all Perform TRIM operation on all pools. .It Fl d , -secure Causes a secure TRIM to be initiated. When performing a secure TRIM, the device guarantees that data stored on the trimmed blocks has been erased. This requires support from the device and is not supported by all SSDs. .It Fl r , -rate Ar rate Controls the rate at which the TRIM operation progresses. Without this option TRIM is executed as quickly as possible. The rate, expressed in bytes per second, is applied on a per-vdev basis and may be set differently for each leaf vdev. .It Fl c , -cancel Cancel trimming on the specified devices, or all eligible devices if none are specified. If one or more target devices are invalid or are not currently being trimmed, the command will fail and no cancellation will occur on any device. .It Fl s , -suspend Suspend trimming on the specified devices, or all eligible devices if none are specified. If one or more target devices are invalid or are not currently being trimmed, the command will fail and no suspension will occur on any device. Trimming can then be resumed by running .Nm zpool Cm trim with no flags on the relevant target devices. .It Fl w , -wait Wait until the devices are done being trimmed before returning. .El .Sh PERIODIC TRIM On machines using systemd, trim timers can be enabled on a per-pool basis. .Nm weekly and .Nm monthly timer units are provided. .Bl -tag -width Ds .It Xo .Xc .Nm systemctl .Cm enable .Cm zfs-trim-\fIweekly\fB@\fIrpool\fB.timer .Cm --now .It Xo .Xc .Nm systemctl .Cm enable .Cm zfs-trim-\fImonthly\fB@\fIotherpool\fB.timer .Cm --now .El . 
.Sh SEE ALSO .Xr systemd.timer 5 , .Xr zpoolprops 7 , .Xr zpool-initialize 8 , .Xr zpool-wait 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 b/sys/contrib/openzfs/man/man8/zpool-upgrade.8 index 20632ae4bba0..cf69060da5ce 100644 --- a/sys/contrib/openzfs/man/man8/zpool-upgrade.8 +++ b/sys/contrib/openzfs/man/man8/zpool-upgrade.8 @@ -1,122 +1,122 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2021, Colm Buckley .\" -.Dd March 16, 2022 +.Dd July 11, 2022 .Dt ZPOOL-UPGRADE 8 .Os . .Sh NAME .Nm zpool-upgrade .Nd manage version and feature flags of ZFS storage pools .Sh SYNOPSIS .Nm zpool .Cm upgrade .Nm zpool .Cm upgrade .Fl v .Nm zpool .Cm upgrade .Op Fl V Ar version .Fl a Ns | Ns Ar pool Ns … . .Sh DESCRIPTION .Bl -tag -width Ds .It Xo .Nm zpool .Cm upgrade .Xc Displays pools which do not have all supported features enabled and pools formatted using a legacy ZFS version number. These pools can continue to be used, but some features may not be available. Use .Nm zpool Cm upgrade Fl a to enable all features on all pools (subject to the .Fl o Sy compatibility property). .It Xo .Nm zpool .Cm upgrade .Fl v .Xc Displays legacy ZFS versions supported by this version of ZFS. See .Xr zpool-features 7 for a description of the feature flags supported by this version of ZFS. .It Xo .Nm zpool .Cm upgrade .Op Fl V Ar version .Fl a Ns | Ns Ar pool Ns … .Xc Enables all supported features on the given pool. .Pp If the pool has specified compatibility feature sets using the .Fl o Sy compatibility property, only the features present in all requested compatibility sets will be enabled. If this property is set to .Ar legacy then no upgrade will take place. .Pp Once this is done, the pool will no longer be accessible on systems that do not support feature flags. See .Xr zpool-features 7 for details on compatibility with systems that support feature flags, but do not support all features enabled on the pool. .Bl -tag -width Ds .It Fl a Enables all supported features (from specified compatibility sets, if any) on all pools. .It Fl V Ar version Upgrade to the specified legacy version. If specified, no features will be enabled on the pool. This option can only be used to increase the version number up to the last supported legacy version number. .El .El .
.Sh EXAMPLES .\" These are, respectively, examples 10 from zpool.8 .\" Make sure to update them bidirectionally .Ss Example 1 : No Upgrading All ZFS Storage Pools to the Current Version The following command upgrades all ZFS Storage pools to the current version of the software: .Bd -literal -compact -offset Ds .No # Nm zpool Cm upgrade Fl a This system is currently running ZFS version 2. .Ed . .Sh SEE ALSO .Xr zpool-features 7 , .Xr zpoolconcepts 7 , .Xr zpoolprops 7 , .Xr zpool-history 8 diff --git a/sys/contrib/openzfs/man/man8/zpool-wait.8 b/sys/contrib/openzfs/man/man8/zpool-wait.8 index 0ffb4badfb7b..28a51d29a913 100644 --- a/sys/contrib/openzfs/man/man8/zpool-wait.8 +++ b/sys/contrib/openzfs/man/man8/zpool-wait.8 @@ -1,119 +1,119 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2021 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd May 27, 2021 +.Dd January 29, 2024 .Dt ZPOOL-WAIT 8 .Os . .Sh NAME .Nm zpool-wait .Nd wait for activity to stop in a ZFS storage pool .Sh SYNOPSIS .Nm zpool .Cm wait .Op Fl Hp .Op Fl T Sy u Ns | Ns Sy d .Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns … .Ar pool .Op Ar interval . .Sh DESCRIPTION Waits until all background activity of the given types has ceased in the given pool. The activity could cease because it has completed, or because it has been paused or canceled by a user, or because the pool has been exported or destroyed. If no activities are specified, the command waits until background activity of every type listed below has ceased. If there is no activity of the given types in progress, the command returns immediately. .Pp These are the possible values for .Ar activity , along with what each one waits for: .Bl -tag -compact -offset Ds -width "raidz_expand" .It Sy discard Checkpoint to be discarded .It Sy free .Sy freeing property to become .Sy 0 .It Sy initialize All initializations to cease .It Sy replace All device replacements to cease .It Sy remove Device removal to cease .It Sy resilver Resilver to cease .It Sy scrub Scrub to cease .It Sy trim Manual trim to cease .It Sy raidz_expand Attaching to a RAID-Z vdev to complete .El .Pp If an .Ar interval is provided, the amount of work remaining, in bytes, for each activity is printed every .Ar interval seconds. .Bl -tag -width Ds .It Fl H Scripted mode. Do not display headers, and separate fields by a single tab instead of arbitrary space. 
.It Fl p Display numbers in parsable (exact) values. .It Fl T Sy u Ns | Ns Sy d Display a time stamp. Specify .Sy u for a printed representation of the internal representation of time. See .Xr time 1 . Specify .Sy d for standard date format. See .Xr date 1 . .El . .Sh SEE ALSO .Xr zpool-checkpoint 8 , .Xr zpool-initialize 8 , .Xr zpool-remove 8 , .Xr zpool-replace 8 , .Xr zpool-resilver 8 , .Xr zpool-scrub 8 , .Xr zpool-status 8 , .Xr zpool-trim 8 diff --git a/sys/contrib/openzfs/man/man8/zpool.8 b/sys/contrib/openzfs/man/man8/zpool.8 index b96944050594..3bfef780b298 100644 --- a/sys/contrib/openzfs/man/man8/zpool.8 +++ b/sys/contrib/openzfs/man/man8/zpool.8 @@ -1,657 +1,657 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. .\" Copyright (c) 2017 Datto Inc. .\" Copyright (c) 2018 George Melikov. All Rights Reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd February 14, 2024 +.Dd November 19, 2024 .Dt ZPOOL 8 .Os . .Sh NAME .Nm zpool .Nd configure ZFS storage pools .Sh SYNOPSIS .Nm .Fl ?V .Nm .Cm version .Op Fl j .Nm .Cm subcommand .Op Ar arguments . .Sh DESCRIPTION The .Nm command configures ZFS storage pools. A storage pool is a collection of devices that provides physical storage and data replication for ZFS datasets. All datasets within a storage pool share the same space. See .Xr zfs 8 for information on managing datasets. .Pp For an overview of creating and managing ZFS storage pools see the .Xr zpoolconcepts 7 manual page. . .Sh SUBCOMMANDS All subcommands that modify state are logged persistently to the pool in their original form. .Pp The .Nm command provides subcommands to create and destroy storage pools, add capacity to storage pools, and provide information about the storage pools. The following subcommands are supported: .Bl -tag -width Ds .It Xo .Nm .Fl ?\& .Xc Displays a help message. .It Xo .Nm .Fl V , -version .Xc .It Xo .Nm .Cm version .Op Fl j .Xc Displays the software version of the .Nm userland utility and the ZFS kernel module. Use .Fl j option to output in JSON format. .El . .Ss Creation .Bl -tag -width Ds .It Xr zpool-create 8 Creates a new storage pool containing the virtual devices specified on the command line. .It Xr zpool-initialize 8 Begins initializing by writing to all unallocated regions on the specified devices, or all eligible devices in the pool if no individual devices are specified. .El . 
.Ss Destruction .Bl -tag -width Ds .It Xr zpool-destroy 8 Destroys the given pool, freeing up any devices for other use. .It Xr zpool-labelclear 8 Removes ZFS label information from the specified .Ar device . .El . .Ss Virtual Devices .Bl -tag -width Ds .It Xo .Xr zpool-attach 8 Ns / Ns Xr zpool-detach 8 .Xc Converts a non-redundant disk into a mirror, or increases the redundancy level of an existing mirror .Cm ( attach Ns ), or performs the inverse operation ( .Cm detach Ns ). .It Xo .Xr zpool-add 8 Ns / Ns Xr zpool-remove 8 .Xc Adds the specified virtual devices to the given pool, or removes the specified device from the pool. .It Xr zpool-replace 8 Replaces an existing device (which may be faulted) with a new one. .It Xr zpool-split 8 Creates a new pool by splitting all mirrors in an existing pool (which decreases its redundancy). .El . .Ss Properties Available pool properties listed in the .Xr zpoolprops 7 manual page. .Bl -tag -width Ds .It Xr zpool-list 8 Lists the given pools along with a health status and space usage. .It Xo .Xr zpool-get 8 Ns / Ns Xr zpool-set 8 .Xc Retrieves the given list of properties .Po or all properties if .Sy all is used .Pc for the specified storage pool(s). .El . .Ss Monitoring .Bl -tag -width Ds .It Xr zpool-status 8 Displays the detailed health status for the given pools. .It Xr zpool-iostat 8 Displays logical I/O statistics for the given pools/vdevs. Physical I/O operations may be observed via .Xr iostat 1 . .It Xr zpool-events 8 Lists all recent events generated by the ZFS kernel modules. These events are consumed by the .Xr zed 8 and used to automate administrative tasks such as replacing a failed device with a hot spare. That manual page also describes the subclasses and event payloads that can be generated. .It Xr zpool-history 8 Displays the command history of the specified pool(s) or all pools if no pool is specified. .El . .Ss Maintenance .Bl -tag -width Ds .It Xr zpool-prefetch 8 Prefetches specific types of pool data. .It Xr zpool-scrub 8 Begins a scrub or resumes a paused scrub. .It Xr zpool-checkpoint 8 Checkpoints the current state of .Ar pool , which can be later restored by .Nm zpool Cm import Fl -rewind-to-checkpoint . .It Xr zpool-trim 8 Initiates an immediate on-demand TRIM operation for all of the free space in a pool. This operation informs the underlying storage devices of all blocks in the pool which are no longer allocated and allows thinly provisioned devices to reclaim the space. .It Xr zpool-sync 8 This command forces all in-core dirty data to be written to the primary pool storage and not the ZIL. It will also update administrative information including quota reporting. Without arguments, .Nm zpool Cm sync will sync all pools on the system. Otherwise, it will sync only the specified pool(s). .It Xr zpool-upgrade 8 Manage the on-disk format version of storage pools. .It Xr zpool-wait 8 Waits until all background activity of the given types has ceased in the given pool. .El . .Ss Fault Resolution .Bl -tag -width Ds .It Xo .Xr zpool-offline 8 Ns / Ns Xr zpool-online 8 .Xc Takes the specified physical device offline or brings it online. .It Xr zpool-resilver 8 Starts a resilver. If an existing resilver is already running it will be restarted from the beginning. .It Xr zpool-reopen 8 Reopen all the vdevs associated with the pool. .It Xr zpool-clear 8 Clears device errors in a pool. .El . .Ss Import & Export .Bl -tag -width Ds .It Xr zpool-import 8 Make disks containing ZFS storage pools available for use on the system. 
.It Xr zpool-export 8 Exports the given pools from the system. .It Xr zpool-reguid 8 Generates a new unique identifier for the pool. .El . .Sh EXIT STATUS The following exit values are returned: .Bl -tag -compact -offset 4n -width "a" .It Sy 0 Successful completion. .It Sy 1 An error occurred. .It Sy 2 Invalid command line options were specified. .El . .Sh EXAMPLES .\" Examples 1, 2, 3, 4, 12, 13 are shared with zpool-create.8. .\" Examples 6, 14 are shared with zpool-add.8. .\" Examples 7, 16 are shared with zpool-list.8. .\" Examples 8 are shared with zpool-destroy.8. .\" Examples 9 are shared with zpool-export.8. .\" Examples 10 are shared with zpool-import.8. .\" Examples 11 are shared with zpool-upgrade.8. .\" Examples 15 are shared with zpool-remove.8. .\" Examples 17 are shared with zpool-status.8. .\" Examples 14, 17 are also shared with zpool-iostat.8. .\" Make sure to update them omnidirectionally .Ss Example 1 : No Creating a RAID-Z Storage Pool The following command creates a pool with a single raidz root vdev that consists of six disks: .Dl # Nm zpool Cm create Ar tank Sy raidz Pa sda sdb sdc sdd sde sdf . .Ss Example 2 : No Creating a Mirrored Storage Pool The following command creates a pool with two mirrors, where each mirror contains two disks: .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy mirror Pa sdc sdd . .Ss Example 3 : No Creating a ZFS Storage Pool by Using Partitions The following command creates a non-redundant pool using two disk partitions: .Dl # Nm zpool Cm create Ar tank Pa sda1 sdb2 . .Ss Example 4 : No Creating a ZFS Storage Pool by Using Files The following command creates a non-redundant pool using files. While not recommended, a pool based on files can be useful for experimental purposes. .Dl # Nm zpool Cm create Ar tank Pa /path/to/file/a /path/to/file/b . .Ss Example 5 : No Making a non-mirrored ZFS Storage Pool mirrored The following command converts an existing single device .Ar sda into a mirror by attaching a second device to it, .Ar sdb . .Dl # Nm zpool Cm attach Ar tank Pa sda sdb . .Ss Example 6 : No Adding a Mirror to a ZFS Storage Pool The following command adds two mirrored disks to the pool .Ar tank , assuming the pool is already made up of two-way mirrors. The additional space is immediately available to any datasets within the pool. .Dl # Nm zpool Cm add Ar tank Sy mirror Pa sda sdb . .Ss Example 7 : No Listing Available ZFS Storage Pools The following command lists all available pools on the system. In this case, the pool .Ar zion is faulted due to a missing device. The results from this command are similar to the following: .Bd -literal -compact -offset Ds .No # Nm zpool Cm list NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT rpool 19.9G 8.43G 11.4G - 33% 42% 1.00x ONLINE - tank 61.5G 20.0G 41.5G - 48% 32% 1.00x ONLINE - zion - - - - - - - FAULTED - .Ed . .Ss Example 8 : No Destroying a ZFS Storage Pool The following command destroys the pool .Ar tank and any datasets contained within: .Dl # Nm zpool Cm destroy Fl f Ar tank . .Ss Example 9 : No Exporting a ZFS Storage Pool The following command exports the devices in pool .Ar tank so that they can be relocated or later imported: .Dl # Nm zpool Cm export Ar tank . .Ss Example 10 : No Importing a ZFS Storage Pool The following command displays available pools, and then imports the pool .Ar tank for use on the system. 
The results from this command are similar to the following: .Bd -literal -compact -offset Ds .No # Nm zpool Cm import pool: tank id: 15451357997522795478 state: ONLINE action: The pool can be imported using its name or numeric identifier. config: tank ONLINE mirror ONLINE sda ONLINE sdb ONLINE .No # Nm zpool Cm import Ar tank .Ed . .Ss Example 11 : No Upgrading All ZFS Storage Pools to the Current Version The following command upgrades all ZFS Storage pools to the current version of the software: .Bd -literal -compact -offset Ds .No # Nm zpool Cm upgrade Fl a This system is currently running ZFS version 2. .Ed . .Ss Example 12 : No Managing Hot Spares The following command creates a new pool with an available hot spare: .Dl # Nm zpool Cm create Ar tank Sy mirror Pa sda sdb Sy spare Pa sdc .Pp If one of the disks were to fail, the pool would be reduced to the degraded state. The failed device can be replaced using the following command: .Dl # Nm zpool Cm replace Ar tank Pa sda sdd .Pp Once the data has been resilvered, the spare is automatically removed and is made available for use should another device fail. The hot spare can be permanently removed from the pool using the following command: .Dl # Nm zpool Cm remove Ar tank Pa sdc . .Ss Example 13 : No Creating a ZFS Pool with Mirrored Separate Intent Logs The following command creates a ZFS storage pool consisting of two, two-way mirrors and mirrored log devices: .Dl # Nm zpool Cm create Ar pool Sy mirror Pa sda sdb Sy mirror Pa sdc sdd Sy log mirror Pa sde sdf . .Ss Example 14 : No Adding Cache Devices to a ZFS Pool The following command adds two disks for use as cache devices to a ZFS storage pool: .Dl # Nm zpool Cm add Ar pool Sy cache Pa sdc sdd .Pp Once added, the cache devices gradually fill with content from main memory. Depending on the size of your cache devices, it could take over an hour for them to fill. Capacity and reads can be monitored using the .Cm iostat subcommand as follows: .Dl # Nm zpool Cm iostat Fl v Ar pool 5 . .Ss Example 15 : No Removing a Mirrored top-level (Log or Data) Device The following commands remove the mirrored log device .Sy mirror-2 and mirrored top-level data device .Sy mirror-1 . .Pp Given this configuration: .Bd -literal -compact -offset Ds pool: tank state: ONLINE scrub: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 sda ONLINE 0 0 0 sdb ONLINE 0 0 0 mirror-1 ONLINE 0 0 0 sdc ONLINE 0 0 0 sdd ONLINE 0 0 0 logs mirror-2 ONLINE 0 0 0 sde ONLINE 0 0 0 sdf ONLINE 0 0 0 .Ed .Pp The command to remove the mirrored log .Ar mirror-2 No is : .Dl # Nm zpool Cm remove Ar tank mirror-2 .Pp At this point, the log device no longer exists (both sides of the mirror have been removed): .Bd -literal -compact -offset Ds pool: tank state: ONLINE scan: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 sda ONLINE 0 0 0 sdb ONLINE 0 0 0 mirror-1 ONLINE 0 0 0 sdc ONLINE 0 0 0 sdd ONLINE 0 0 0 .Ed .Pp The command to remove the mirrored data .Ar mirror-1 No is : .Dl # Nm zpool Cm remove Ar tank mirror-1 .Pp After .Ar mirror-1 No has been evacuated, the pool remains redundant, but the total amount of space is reduced: .Bd -literal -compact -offset Ds pool: tank state: ONLINE scan: none requested config: NAME STATE READ WRITE CKSUM tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 sda ONLINE 0 0 0 sdb ONLINE 0 0 0 .Ed . 
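.\" A follow-on sketch, assuming the same pool name "tank" as in the example
.\" above; the activity name comes from zpool-wait(8).
.Pp
Because the evacuation runs in the background, a script can block until the
removal has finished before inspecting the new configuration:
.Dl # Nm zpool Cm wait Fl t Ar remove tank
.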
.Ss Example 16 : No Displaying expanded space on a device The following command displays the detailed information for the pool .Ar data . This pool is comprised of a single raidz vdev where one of its devices increased its capacity by 10 GiB. In this example, the pool will not be able to utilize this extra capacity until all the devices under the raidz vdev have been expanded. .Bd -literal -compact -offset Ds .No # Nm zpool Cm list Fl v Ar data NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - raidz1 23.9G 14.6G 9.30G - 48% sda - - - - - sdb - - - 10G - sdc - - - - - .Ed . .Ss Example 17 : No Adding output columns Additional columns can be added to the .Nm zpool Cm status No and Nm zpool Cm iostat No output with Fl c . .Bd -literal -compact -offset Ds .No # Nm zpool Cm status Fl c Pa vendor , Ns Pa model , Ns Pa size NAME STATE READ WRITE CKSUM vendor model size tank ONLINE 0 0 0 mirror-0 ONLINE 0 0 0 U1 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U10 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U11 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U12 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U13 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T U14 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T .No # Nm zpool Cm iostat Fl vc Pa size capacity operations bandwidth pool alloc free read write read write size ---------- ----- ----- ----- ----- ----- ----- ---- rpool 14.6G 54.9G 4 55 250K 2.69M sda1 14.6G 54.9G 4 55 250K 2.69M 70G ---------- ----- ----- ----- ----- ----- ----- ---- .Ed . .Sh ENVIRONMENT VARIABLES .Bl -tag -compact -width "ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE" .It Sy ZFS_ABORT Cause .Nm to dump core on exit for the purposes of running .Sy ::findleaks . .It Sy ZFS_COLOR Use ANSI color in .Nm zpool Cm status and .Nm zpool Cm iostat output. .It Sy ZPOOL_AUTO_POWER_ON_SLOT Automatically attempt to turn on the drives enclosure slot power to a drive when running the .Nm zpool Cm online or .Nm zpool Cm clear commands. This has the same effect as passing the .Fl -power option to those commands. .It Sy ZPOOL_POWER_ON_SLOT_TIMEOUT_MS The maximum time in milliseconds to wait for a slot power sysfs value to return the correct value after writing it. For example, after writing "on" to the sysfs enclosure slot power_control file, it can take some time for the enclosure to power down the slot and return "on" if you read back the 'power_control' value. Defaults to 30 seconds (30000ms) if not set. .It Sy ZPOOL_IMPORT_PATH The search path for devices or files to use with the pool. This is a colon-separated list of directories in which .Nm looks for device nodes and files. Similar to the .Fl d option in .Nm zpool import . .It Sy ZPOOL_IMPORT_UDEV_TIMEOUT_MS The maximum time in milliseconds that .Nm zpool import will wait for an expected device to be available. .It Sy ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE If set, suppress warning about non-native vdev ashift in .Nm zpool Cm status . The value is not used, only the presence or absence of the variable matters. .It Sy ZPOOL_VDEV_NAME_GUID Cause .Nm subcommands to output vdev GUIDs by default. This behavior is identical to the .Nm zpool Cm status Fl g command line option. .It Sy ZPOOL_VDEV_NAME_FOLLOW_LINKS Cause .Nm subcommands to follow links for vdev names by default. This behavior is identical to the .Nm zpool Cm status Fl L command line option. .It Sy ZPOOL_VDEV_NAME_PATH Cause .Nm subcommands to output full vdev path names by default. This behavior is identical to the .Nm zpool Cm status Fl P command line option. 
.It Sy ZFS_VDEV_DEVID_OPT_OUT Older OpenZFS implementations had issues when attempting to display pool config vdev names if a .Sy devid NVP value is present in the pool's config. .Pp For example, a pool that originated on illumos platform would have a .Sy devid value in the config and .Nm zpool Cm status would fail when listing the config. This would also be true for future Linux-based pools. .Pp A pool can be stripped of any .Sy devid values on import or prevented from adding them on .Nm zpool Cm create or .Nm zpool Cm add by setting .Sy ZFS_VDEV_DEVID_OPT_OUT . .Pp .It Sy ZPOOL_SCRIPTS_AS_ROOT Allow a privileged user to run .Nm zpool Cm status Ns / Ns Cm iostat Fl c . Normally, only unprivileged users are allowed to run .Fl c . .It Sy ZPOOL_SCRIPTS_PATH The search path for scripts when running .Nm zpool Cm status Ns / Ns Cm iostat Fl c . This is a colon-separated list of directories and overrides the default .Pa ~/.zpool.d and .Pa /etc/zfs/zpool.d search paths. .It Sy ZPOOL_SCRIPTS_ENABLED Allow a user to run .Nm zpool Cm status Ns / Ns Cm iostat Fl c . If .Sy ZPOOL_SCRIPTS_ENABLED is not set, it is assumed that the user is allowed to run .Nm zpool Cm status Ns / Ns Cm iostat Fl c . .\" Shared with zfs.8 .It Sy ZFS_MODULE_TIMEOUT Time, in seconds, to wait for .Pa /dev/zfs to appear. Defaults to .Sy 10 , max .Sy 600 Pq 10 minutes . If .Pf < Sy 0 , wait forever; if .Sy 0 , don't wait. .El . .Sh INTERFACE STABILITY .Sy Evolving . .Sh SEE ALSO .Xr zfs 4 , .Xr zpool-features 7 , .Xr zpoolconcepts 7 , .Xr zpoolprops 7 , .Xr zed 8 , .Xr zfs 8 , .Xr zpool-add 8 , .Xr zpool-attach 8 , .Xr zpool-checkpoint 8 , .Xr zpool-clear 8 , .Xr zpool-create 8 , .Xr zpool-ddtprune 8 , .Xr zpool-destroy 8 , .Xr zpool-detach 8 , .Xr zpool-events 8 , .Xr zpool-export 8 , .Xr zpool-get 8 , .Xr zpool-history 8 , .Xr zpool-import 8 , .Xr zpool-initialize 8 , .Xr zpool-iostat 8 , .Xr zpool-labelclear 8 , .Xr zpool-list 8 , .Xr zpool-offline 8 , .Xr zpool-online 8 , .Xr zpool-prefetch 8 , .Xr zpool-reguid 8 , .Xr zpool-remove 8 , .Xr zpool-reopen 8 , .Xr zpool-replace 8 , .Xr zpool-resilver 8 , .Xr zpool-scrub 8 , .Xr zpool-set 8 , .Xr zpool-split 8 , .Xr zpool-status 8 , .Xr zpool-sync 8 , .Xr zpool-trim 8 , .Xr zpool-upgrade 8 , .Xr zpool-wait 8 diff --git a/sys/contrib/openzfs/man/man8/zstream.8 b/sys/contrib/openzfs/man/man8/zstream.8 index 03a8479c9e6a..5b3d063bc4a5 100644 --- a/sys/contrib/openzfs/man/man8/zstream.8 +++ b/sys/contrib/openzfs/man/man8/zstream.8 @@ -1,200 +1,200 @@ .\" SPDX-License-Identifier: CDDL-1.0 .\" .\" CDDL HEADER START .\" .\" The contents of this file are subject to the terms of the .\" Common Development and Distribution License (the "License"). .\" You may not use this file except in compliance with the License. .\" .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE .\" or https://opensource.org/licenses/CDDL-1.0. .\" See the License for the specific language governing permissions .\" and limitations under the License. .\" .\" When distributing Covered Code, include this CDDL HEADER in each .\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. .\" If applicable, add the following below this CDDL HEADER, with the .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" .\" CDDL HEADER END .\" .\" Copyright (c) 2020 by Delphix. All rights reserved. .\" -.Dd October 4, 2022 +.Dd November 10, 2022 .Dt ZSTREAM 8 .Os . 
.Sh NAME .Nm zstream .Nd manipulate ZFS send streams .Sh SYNOPSIS .Nm .Cm dump .Op Fl Cvd .Op Ar file .Nm .Cm decompress .Op Fl v .Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \&, Ns Ar type Ns ... .Nm .Cm redup .Op Fl v .Ar file .Nm .Cm token .Ar resume_token .Nm .Cm recompress .Op Fl l Ar level .Ar algorithm . .Sh DESCRIPTION The .Sy zstream utility manipulates ZFS send streams output by the .Sy zfs send command. .Bl -tag -width "" .It Xo .Nm .Cm dump .Op Fl Cvd .Op Ar file .Xc Print information about the specified send stream, including headers and record counts. The send stream may either be in the specified .Ar file , or provided on standard input. .Bl -tag -width "-D" .It Fl C Suppress the validation of checksums. .It Fl v Verbose. Print metadata for each record. .It Fl d Dump data contained in each record. Implies verbose. .El .Pp The .Nm zstreamdump alias is provided for compatibility and is equivalent to running .Nm .Cm dump . .It Xo .Nm .Cm token .Ar resume_token .Xc Dumps zfs resume token information .It Xo .Nm .Cm decompress .Op Fl v .Op Ar object Ns Sy \&, Ns Ar offset Ns Op Sy \&, Ns Ar type Ns ... .Xc Decompress selected records in a ZFS send stream provided on standard input, when the compression type recorded in ZFS metadata may be incorrect. Specify the object number and byte offset of each record that you wish to decompress. Optionally specify the compression type. Valid compression types include .Sy off , .Sy gzip , .Sy lz4 , .Sy lzjb , .Sy zstd , and .Sy zle . The default is .Sy lz4 . Every record for that object beginning at that offset will be decompressed, if possible. It may not be possible, because the record may be corrupted in some but not all of the stream's snapshots. Specifying a compression type of .Sy off will change the stream's metadata accordingly, without attempting decompression. This can be useful if the record is already uncompressed but the metadata insists otherwise. The repaired stream will be written to standard output. .Bl -tag -width "-v" .It Fl v Verbose. Print summary of decompressed records. .El .It Xo .Nm .Cm redup .Op Fl v .Ar file .Xc Deduplicated send streams can be generated by using the .Nm zfs Cm send Fl D command. The ability to send deduplicated send streams is deprecated. In the future, the ability to receive a deduplicated send stream with .Nm zfs Cm receive will be removed. However, deduplicated send streams can still be received by utilizing .Nm zstream Cm redup . .Pp The .Nm zstream Cm redup command is provided a .Ar file containing a deduplicated send stream, and outputs an equivalent non-deduplicated send stream on standard output. Therefore, a deduplicated send stream can be received by running: .Dl # Nm zstream Cm redup Pa DEDUP_STREAM_FILE | Nm zfs Cm receive No … .Bl -tag -width "-D" .It Fl v Verbose. Print summary of converted records. .El .It Xo .Nm .Cm recompress .Op Fl l Ar level .Ar algorithm .Xc Recompresses a send stream, provided on standard input, using the provided algorithm and optional level, and writes the modified stream to standard output. All WRITE records in the send stream will be recompressed, unless they fail to result in size reduction compared to being left uncompressed. The provided algorithm can be any valid value to the .Nm compress property. Note that encrypted send streams cannot be recompressed. .Bl -tag -width "-l" .It Fl l Ar level Specifies compression level. Only needed for algorithms where the level is not implied as part of the name of the algorithm (e.g. 
gzip-3 does not require it, while zstd does, if a non-default level is desired). .El .El . .Sh EXAMPLES Heal a dataset that was corrupted due to OpenZFS bug #12762. First, determine which records are corrupt. That cannot be done automatically; it requires information beyond ZFS's metadata. If object .Sy 128 is corrupted at offset .Sy 0 and is compressed using .Sy lz4 , then run this command: .Bd -literal .No # Nm zfs Ar send Fl c Ar … | Nm zstream decompress Ar 128,0,lz4 | \ Nm zfs recv Ar … .Ed .Sh SEE ALSO .Xr zfs 8 , .Xr zfs-receive 8 , .Xr zfs-send 8 , .Lk https://github.com/openzfs/zfs/issues/12762 diff --git a/sys/contrib/openzfs/module/Kbuild.in b/sys/contrib/openzfs/module/Kbuild.in index 362d2295e091..58a80dc4402c 100644 --- a/sys/contrib/openzfs/module/Kbuild.in +++ b/sys/contrib/openzfs/module/Kbuild.in @@ -1,529 +1,529 @@ # When integrated in to a monolithic kernel the spl module must appear # first. This ensures its module initialization function is run before # any of the other module initialization functions which depend on it. ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement ZFS_MODULE_CFLAGS += -Wmissing-prototypes -ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ +ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @KERNEL_NO_FORMAT_ZERO_LENGTH@ ifneq ($(KBUILD_EXTMOD),) zfs_include = @abs_top_srcdir@/include icp_include = @abs_srcdir@/icp/include zstd_include = @abs_srcdir@/zstd/include ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include src = @abs_srcdir@ obj = @abs_builddir@ else zfs_include = $(srctree)/include/zfs icp_include = $(src)/icp/include zstd_include = $(src)/zstd/include ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h endif ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs ZFS_MODULE_CFLAGS += -I$(zfs_include) ZFS_MODULE_CPPFLAGS += -D_KERNEL ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ # KASAN enables -Werror=frame-larger-than=1024, which # breaks oh so many parts of our build. ifeq ($(CONFIG_KASAN),y) ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= endif # Generated binary search code is particularly bad with this optimization. # Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c # is not affected when unrolling is done. 
# Disable it until the following upstream issue is resolved: # https://github.com/llvm/llvm-project/issues/62790 ifeq ($(CONFIG_X86),y) ifeq ($(CONFIG_CC_IS_CLANG),y) CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false endif endif ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ endif asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) ifeq ($(CONFIG_ARM64),y) CFLAGS_REMOVE_zcommon/zfs_fletcher_aarch64_neon.o += -mgeneral-regs-only CFLAGS_REMOVE_zfs/vdev_raidz_math_aarch64_neon.o += -mgeneral-regs-only CFLAGS_REMOVE_zfs/vdev_raidz_math_aarch64_neonx2.o += -mgeneral-regs-only endif # Suppress unused-value warnings in sparc64 architecture headers ccflags-$(CONFIG_SPARC64) += -Wno-unused-value obj-$(CONFIG_ZFS) := spl.o zfs.o SPL_OBJS := \ spl-atomic.o \ spl-condvar.o \ spl-cred.o \ spl-err.o \ spl-generic.o \ spl-kmem-cache.o \ spl-kmem.o \ spl-kstat.o \ spl-proc.o \ spl-procfs-list.o \ spl-shrinker.o \ spl-taskq.o \ spl-thread.o \ spl-trace.o \ spl-tsd.o \ spl-vmem.o \ spl-xdr.o \ spl-zlib.o \ spl-zone.o spl-objs += $(addprefix os/linux/spl/,$(SPL_OBJS)) zfs-objs += avl/avl.o ICP_OBJS := \ algs/aes/aes_impl.o \ algs/aes/aes_impl_generic.o \ algs/aes/aes_modes.o \ algs/blake3/blake3.o \ algs/blake3/blake3_generic.o \ algs/blake3/blake3_impl.o \ algs/edonr/edonr.o \ algs/modes/ccm.o \ algs/modes/gcm.o \ algs/modes/gcm_generic.o \ algs/modes/modes.o \ algs/sha2/sha2_generic.o \ algs/sha2/sha256_impl.o \ algs/sha2/sha512_impl.o \ algs/skein/skein.o \ algs/skein/skein_block.o \ algs/skein/skein_iv.o \ api/kcf_cipher.o \ api/kcf_ctxops.o \ api/kcf_mac.o \ core/kcf_callprov.o \ core/kcf_mech_tabs.o \ core/kcf_prov_lib.o \ core/kcf_prov_tabs.o \ core/kcf_sched.o \ illumos-crypto.o \ io/aes.o \ io/sha2_mod.o \ spi/kcf_spi.o ICP_OBJS_X86_64 := \ asm-x86_64/aes/aes_aesni.o \ asm-x86_64/aes/aes_amd64.o \ asm-x86_64/aes/aeskey.o \ asm-x86_64/blake3/blake3_avx2.o \ asm-x86_64/blake3/blake3_avx512.o \ asm-x86_64/blake3/blake3_sse2.o \ asm-x86_64/blake3/blake3_sse41.o \ asm-x86_64/sha2/sha256-x86_64.o \ asm-x86_64/sha2/sha512-x86_64.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/aesni-gcm-avx2-vaes.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ algs/aes/aes_impl_x86-64.o \ algs/modes/gcm_pclmulqdq.o ICP_OBJS_ARM := \ asm-arm/sha2/sha256-armv7.o \ asm-arm/sha2/sha512-armv7.o ICP_OBJS_ARM64 := \ asm-aarch64/blake3/b3_aarch64_sse2.o \ asm-aarch64/blake3/b3_aarch64_sse41.o \ asm-aarch64/sha2/sha256-armv8.o \ asm-aarch64/sha2/sha512-armv8.o ICP_OBJS_PPC_PPC64 := \ asm-ppc64/blake3/b3_ppc64le_sse2.o \ asm-ppc64/blake3/b3_ppc64le_sse41.o \ asm-ppc64/sha2/sha256-p8.o \ asm-ppc64/sha2/sha512-p8.o \ asm-ppc64/sha2/sha256-ppc.o \ asm-ppc64/sha2/sha512-ppc.o zfs-objs += $(addprefix icp/,$(ICP_OBJS)) zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64)) zfs-$(CONFIG_ARM) += $(addprefix icp/,$(ICP_OBJS_ARM)) zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) 
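# Note: each arch-specific ICP list above is only appended when the matching
# kernel config symbol (CONFIG_X86, CONFIG_ARM64, CONFIG_PPC64, ...) is set.
# The per-object asflags-y/ccflags-y assignments below then add the ICP, SPL
# and ZFS include directories needed to build those objects.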
$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include) -I$(zfs_include)/os/linux/spl -I$(zfs_include) $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) -I$(zfs_include)/os/linux/spl -I$(zfs_include) LUA_OBJS := \ lapi.o \ lauxlib.o \ lbaselib.o \ lcode.o \ lcompat.o \ lcorolib.o \ lctype.o \ ldebug.o \ ldo.o \ lfunc.o \ lgc.o \ llex.o \ lmem.o \ lobject.o \ lopcodes.o \ lparser.o \ lstate.o \ lstring.o \ lstrlib.o \ ltable.o \ ltablib.o \ ltm.o \ lvm.o \ lzio.o \ setjmp/setjmp.o zfs-objs += $(addprefix lua/,$(LUA_OBJS)) NVPAIR_OBJS := \ fnvpair.o \ nvpair.o \ nvpair_alloc_fixed.o \ nvpair_alloc_spl.o zfs-objs += $(addprefix nvpair/,$(NVPAIR_OBJS)) UNICODE_OBJS := \ u8_textprep.o zfs-objs += $(addprefix unicode/,$(UNICODE_OBJS)) ZCOMMON_OBJS := \ cityhash.o \ simd_stat.o \ zfeature_common.o \ zfs_comutil.o \ zfs_deleg.o \ zfs_fletcher.o \ zfs_fletcher_superscalar.o \ zfs_fletcher_superscalar4.o \ zfs_namecheck.o \ zfs_prop.o \ zfs_valstr.o \ zpool_prop.o \ zprop_common.o ZCOMMON_OBJS_X86 := \ zfs_fletcher_avx512.o \ zfs_fletcher_intel.o \ zfs_fletcher_sse.o ZCOMMON_OBJS_ARM64 := \ zfs_fletcher_aarch64_neon.o zfs-objs += $(addprefix zcommon/,$(ZCOMMON_OBJS)) zfs-$(CONFIG_X86) += $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix zcommon/,$(ZCOMMON_OBJS_X86)) zfs-$(CONFIG_ARM64) += $(addprefix zcommon/,$(ZCOMMON_OBJS_ARM64)) # Zstd uses -O3 by default, so we should follow ZFS_ZSTD_FLAGS := -O3 # -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h # Set it for other compilers, too. ZFS_ZSTD_FLAGS += -fno-tree-vectorize # SSE register return with SSE disabled if -march=znverX is passed ZFS_ZSTD_FLAGS += -U__BMI__ # Quiet warnings about frame size due to unused code in unmodified zstd lib ZFS_ZSTD_FLAGS += -Wframe-larger-than=20480 ZSTD_OBJS := \ zfs_zstd.o \ zstd_sparc.o ZSTD_UPSTREAM_OBJS := \ lib/common/entropy_common.o \ lib/common/error_private.o \ lib/common/fse_decompress.o \ lib/common/pool.o \ lib/common/zstd_common.o \ lib/compress/fse_compress.o \ lib/compress/hist.o \ lib/compress/huf_compress.o \ lib/compress/zstd_compress.o \ lib/compress/zstd_compress_literals.o \ lib/compress/zstd_compress_sequences.o \ lib/compress/zstd_compress_superblock.o \ lib/compress/zstd_double_fast.o \ lib/compress/zstd_fast.o \ lib/compress/zstd_lazy.o \ lib/compress/zstd_ldm.o \ lib/compress/zstd_opt.o \ lib/decompress/huf_decompress.o \ lib/decompress/zstd_ddict.o \ lib/decompress/zstd_decompress.o \ lib/decompress/zstd_decompress_block.o zfs-objs += $(addprefix zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) # Disable aarch64 neon SIMD instructions for kernel mode $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -I$(zstd_include) $(ZFS_ZSTD_FLAGS) $(addprefix $(obj)/zstd/,$(ZSTD_OBJS) $(ZSTD_UPSTREAM_OBJS)) : asflags-y += -I$(zstd_include) $(addprefix $(obj)/zstd/,$(ZSTD_UPSTREAM_OBJS)) : ccflags-y += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w $(obj)/zstd/zfs_zstd.o : ccflags-y += -include $(zstd_include)/zstd_compat_wrapper.h ZFS_OBJS := \ abd.o \ aggsum.o \ arc.o \ blake3_zfs.o \ blkptr.o \ bplist.o \ bpobj.o \ bptree.o \ bqueue.o \ brt.o \ btree.o \ dataset_kstats.o \ dbuf.o \ dbuf_stats.o \ ddt.o \ ddt_log.o \ ddt_stats.o \ ddt_zap.o \ dmu.o \ dmu_direct.o \ dmu_diff.o \ dmu_object.o \ dmu_objset.o \ 
dmu_recv.o \ dmu_redact.o \ dmu_send.o \ dmu_traverse.o \ dmu_tx.o \ dmu_zfetch.o \ dnode.o \ dnode_sync.o \ dsl_bookmark.o \ dsl_crypt.o \ dsl_dataset.o \ dsl_deadlist.o \ dsl_deleg.o \ dsl_destroy.o \ dsl_dir.o \ dsl_pool.o \ dsl_prop.o \ dsl_scan.o \ dsl_synctask.o \ dsl_userhold.o \ edonr_zfs.o \ fm.o \ gzip.o \ hkdf.o \ lz4.o \ lz4_zfs.o \ lzjb.o \ metaslab.o \ mmp.o \ multilist.o \ objlist.o \ pathname.o \ range_tree.o \ refcount.o \ rrwlock.o \ sa.o \ sha2_zfs.o \ skein_zfs.o \ spa.o \ spa_checkpoint.o \ spa_config.o \ spa_errlog.o \ spa_history.o \ spa_log_spacemap.o \ spa_misc.o \ spa_stats.o \ space_map.o \ space_reftree.o \ txg.o \ uberblock.o \ unique.o \ vdev.o \ vdev_draid.o \ vdev_draid_rand.o \ vdev_file.o \ vdev_indirect.o \ vdev_indirect_births.o \ vdev_indirect_mapping.o \ vdev_initialize.o \ vdev_label.o \ vdev_mirror.o \ vdev_missing.o \ vdev_queue.o \ vdev_raidz.o \ vdev_raidz_math.o \ vdev_raidz_math_scalar.o \ vdev_rebuild.o \ vdev_removal.o \ vdev_root.o \ vdev_trim.o \ zap.o \ zap_leaf.o \ zap_micro.o \ zcp.o \ zcp_get.o \ zcp_global.o \ zcp_iter.o \ zcp_set.o \ zcp_synctask.o \ zfeature.o \ zfs_byteswap.o \ zfs_chksum.o \ zfs_debug_common.o \ zfs_crrd.o \ zfs_fm.o \ zfs_fuid.o \ zfs_impl.o \ zfs_ioctl.o \ zfs_log.o \ zfs_onexit.o \ zfs_quota.o \ zfs_ratelimit.o \ zfs_replay.o \ zfs_rlock.o \ zfs_sa.o \ zfs_vnops.o \ zfs_znode.o \ zil.o \ zio.o \ zio_checksum.o \ zio_compress.o \ zio_inject.o \ zle.o \ zrlock.o \ zthr.o \ zvol.o ZFS_OBJS_OS := \ abd_os.o \ arc_os.o \ mmp_os.o \ policy.o \ qat.o \ qat_compress.o \ qat_crypt.o \ spa_misc_os.o \ trace.o \ vdev_disk.o \ vdev_raidz.o \ vdev_label_os.o \ zfs_acl.o \ zfs_ctldir.o \ zfs_debug.o \ zfs_dir.o \ zfs_file_os.o \ zfs_ioctl_os.o \ zfs_racct.o \ zfs_sysfs.o \ zfs_uio.o \ zfs_vfsops.o \ zfs_vnops_os.o \ zfs_znode_os.o \ zio_crypt.o \ zpl_ctldir.o \ zpl_export.o \ zpl_file.o \ zpl_file_range.o \ zpl_inode.o \ zpl_super.o \ zpl_xattr.o \ zvol_os.o ZFS_OBJS_X86 := \ vdev_raidz_math_avx2.o \ vdev_raidz_math_avx512bw.o \ vdev_raidz_math_avx512f.o \ vdev_raidz_math_sse2.o \ vdev_raidz_math_ssse3.o ZFS_OBJS_ARM64 := \ vdev_raidz_math_aarch64_neon.o \ vdev_raidz_math_aarch64_neonx2.o ZFS_OBJS_PPC_PPC64 := \ vdev_raidz_math_powerpc_altivec.o zfs-objs += $(addprefix zfs/,$(ZFS_OBJS)) $(addprefix os/linux/zfs/,$(ZFS_OBJS_OS)) zfs-$(CONFIG_X86) += $(addprefix zfs/,$(ZFS_OBJS_X86)) zfs-$(CONFIG_UML_X86)+= $(addprefix zfs/,$(ZFS_OBJS_X86)) zfs-$(CONFIG_ARM64) += $(addprefix zfs/,$(ZFS_OBJS_ARM64)) zfs-$(CONFIG_PPC) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) zfs-$(CONFIG_PPC64) += $(addprefix zfs/,$(ZFS_OBJS_PPC_PPC64)) UBSAN_SANITIZE_zap_leaf.o := n UBSAN_SANITIZE_zap_micro.o := n UBSAN_SANITIZE_sa.o := n UBSAN_SANITIZE_zfs/zap_micro.o := n UBSAN_SANITIZE_zfs/sa.o := n ifeq ($(CONFIG_ALTIVEC),y) $(obj)/zfs/vdev_raidz_math_powerpc_altivec.o : c_flags += -maltivec endif # The following recipes attempt to fix out of src-tree builds, where $(src) != $(obj), so that the # subdir %.c/%.S -> %.o targets will work as expected. 
The in-kernel pattern targets do not seem to # be working on subdirs since about ~6.10 zobjdirs = $(dir $(zfs-objs)) $(dir $(spl-objs)) \ $(dir $(zfs-$(CONFIG_X86))) $(dir $(zfs-$(CONFIG_UML_X86))) $(dir $(zfs-$(CONFIG_ARM64))) \ $(dir $(zfs-$(CONFIG_PPC64))) $(dir $(zfs-$(CONFIG_PPC))) z_cdirs = $(sort $(filter-out lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \ $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/) \ $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/) \ $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs))) z_sdirs = $(sort $(filter lua/setjmp/ $(addprefix icp/asm-aarch64/, aes/ blake3/ modes/ sha2/) \ $(addprefix icp/asm-x86_64/, aes/ blake3/ modes/ sha2/) \ $(addprefix icp/asm-ppc/, aes/ blake3/ modes/ sha2/) \ $(addprefix icp/asm-ppc64/, aes/ blake3/ modes/ sha2/), $(zobjdirs))) define ZKMOD_C_O_MAKE_TARGET $1%.o: $(src)/$1%.c FORCE $$(call if_changed_rule,cc_o_c) $$(call cmd,force_checksrc) endef define ZKMOD_S_O_MAKE_TARGET $1%.o: $(src)/$1%.S FORCE $$(call if_changed_rule,as_o_S) $$(call cmd,force_checksrc) endef $(foreach target,$(z_cdirs), $(eval $(call ZKMOD_C_O_MAKE_TARGET,$(target)))) $(foreach target,$(z_sdirs), $(eval $(call ZKMOD_S_O_MAKE_TARGET,$(target)))) diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c index 6d3bcca9f995..dcb0a391dda4 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha256_impl.c @@ -1,314 +1,317 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2022 Tino Reichardt */ #include #include #include #include #include #include #define TF(E, N) \ extern void ASMABI E(uint32_t s[8], const void *, size_t); \ static inline void N(uint32_t s[8], const void *d, size_t b) { \ kfpu_begin(); E(s, d, b); kfpu_end(); \ } +#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__PPC64__) /* some implementation is always okay */ static inline boolean_t sha2_is_supported(void) { return (B_TRUE); } +#endif #if defined(__x86_64) /* Users of ASMABI requires all calls to be from wrappers */ extern void ASMABI zfs_sha256_transform_x64(uint32_t s[8], const void *, size_t); static inline void tf_sha256_transform_x64(uint32_t s[8], const void *d, size_t b) { zfs_sha256_transform_x64(s, d, b); } const sha256_ops_t sha256_x64_impl = { .is_supported = sha2_is_supported, .transform = tf_sha256_transform_x64, .name = "x64" }; #if defined(HAVE_SSSE3) static boolean_t sha2_have_ssse3(void) { return (kfpu_allowed() && zfs_ssse3_available()); } TF(zfs_sha256_transform_ssse3, tf_sha256_ssse3); const sha256_ops_t sha256_ssse3_impl = { .is_supported = sha2_have_ssse3, .transform = tf_sha256_ssse3, .name = "ssse3" }; #endif #if defined(HAVE_AVX) static boolean_t sha2_have_avx(void) { return (kfpu_allowed() && zfs_avx_available()); } TF(zfs_sha256_transform_avx, tf_sha256_avx); const sha256_ops_t sha256_avx_impl = { .is_supported = sha2_have_avx, .transform = tf_sha256_avx, .name = "avx" }; #endif #if defined(HAVE_AVX2) static boolean_t sha2_have_avx2(void) { return (kfpu_allowed() && zfs_avx2_available()); } TF(zfs_sha256_transform_avx2, tf_sha256_avx2); const sha256_ops_t sha256_avx2_impl = { .is_supported = sha2_have_avx2, .transform = tf_sha256_avx2, .name = "avx2" }; #endif #if defined(HAVE_SSE4_1) static boolean_t sha2_have_shani(void) { return (kfpu_allowed() && zfs_sse4_1_available() && \ zfs_shani_available()); } TF(zfs_sha256_transform_shani, tf_sha256_shani); const sha256_ops_t sha256_shani_impl = { .is_supported = sha2_have_shani, .transform = tf_sha256_shani, .name = "shani" }; #endif #elif defined(__aarch64__) || defined(__arm__) extern void zfs_sha256_block_armv7(uint32_t s[8], const void *, size_t); const sha256_ops_t sha256_armv7_impl = { .is_supported = sha2_is_supported, .transform = zfs_sha256_block_armv7, .name = "armv7" }; #if __ARM_ARCH > 6 static boolean_t sha256_have_neon(void) { return (kfpu_allowed() && zfs_neon_available()); } static boolean_t sha256_have_armv8ce(void) { return (kfpu_allowed() && zfs_sha256_available()); } TF(zfs_sha256_block_neon, tf_sha256_neon); const sha256_ops_t sha256_neon_impl = { .is_supported = sha256_have_neon, .transform = tf_sha256_neon, .name = "neon" }; TF(zfs_sha256_block_armv8, tf_sha256_armv8ce); const sha256_ops_t sha256_armv8_impl = { .is_supported = sha256_have_armv8ce, .transform = tf_sha256_armv8ce, .name = "armv8-ce" }; #endif #elif defined(__PPC64__) static boolean_t sha256_have_isa207(void) { return (kfpu_allowed() && zfs_isa207_available()); } TF(zfs_sha256_ppc, tf_sha256_ppc); const sha256_ops_t sha256_ppc_impl = { .is_supported = sha2_is_supported, .transform = tf_sha256_ppc, .name = "ppc" }; TF(zfs_sha256_power8, tf_sha256_power8); const sha256_ops_t sha256_power8_impl = { .is_supported = sha256_have_isa207, .transform = 
tf_sha256_power8, .name = "power8" }; #endif /* __PPC64__ */ /* the two generic ones */ extern const sha256_ops_t sha256_generic_impl; /* array with all sha256 implementations */ static const sha256_ops_t *const sha256_impls[] = { &sha256_generic_impl, #if defined(__x86_64) &sha256_x64_impl, #endif #if defined(__x86_64) && defined(HAVE_SSSE3) &sha256_ssse3_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX) &sha256_avx_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX2) &sha256_avx2_impl, #endif #if defined(__x86_64) && defined(HAVE_SSE4_1) &sha256_shani_impl, #endif #if defined(__aarch64__) || defined(__arm__) &sha256_armv7_impl, #if __ARM_ARCH > 6 &sha256_neon_impl, &sha256_armv8_impl, #endif #endif #if defined(__PPC64__) &sha256_ppc_impl, &sha256_power8_impl, #endif /* __PPC64__ */ }; /* use the generic implementation functions */ #define IMPL_NAME "sha256" #define IMPL_OPS_T sha256_ops_t #define IMPL_ARRAY sha256_impls #define IMPL_GET_OPS sha256_get_ops #define ZFS_IMPL_OPS zfs_sha256_ops #include #ifdef _KERNEL #define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") #if defined(__linux__) static int sha256_param_get(char *buffer, zfs_kernel_param_t *unused) { const uint32_t impl = IMPL_READ(generic_impl_chosen); char *fmt; int cnt = 0; /* cycling */ fmt = IMPL_FMT(impl, IMPL_CYCLE); cnt += sprintf(buffer + cnt, fmt, "cycle"); /* list fastest */ fmt = IMPL_FMT(impl, IMPL_FASTEST); cnt += sprintf(buffer + cnt, fmt, "fastest"); /* list all supported implementations */ generic_impl_init(); for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); cnt += sprintf(buffer + cnt, fmt, generic_supp_impls[i]->name); } return (cnt); } static int sha256_param_set(const char *val, zfs_kernel_param_t *unused) { (void) unused; return (generic_impl_setname(val)); } #elif defined(__FreeBSD__) #include static int sha256_param(ZFS_MODULE_PARAM_ARGS) { int err; generic_impl_init(); if (req->newptr == NULL) { const uint32_t impl = IMPL_READ(generic_impl_chosen); const int init_buflen = 64; const char *fmt; struct sbuf *s; s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); /* cycling */ fmt = IMPL_FMT(impl, IMPL_CYCLE); (void) sbuf_printf(s, fmt, "cycle"); /* list fastest */ fmt = IMPL_FMT(impl, IMPL_FASTEST); (void) sbuf_printf(s, fmt, "fastest"); /* list all supported implementations */ for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); } err = sbuf_finish(s); sbuf_delete(s); return (err); } char buf[16]; err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err) { return (err); } return (-generic_impl_setname(buf)); } #endif #undef IMPL_FMT ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha256_impl, sha256_param_set, sha256_param_get, ZMOD_RW, \ "Select SHA256 implementation."); #endif #undef TF diff --git a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c index 2efd9fcf4c99..a85a71a83df4 100644 --- a/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c +++ b/sys/contrib/openzfs/module/icp/algs/sha2/sha512_impl.c @@ -1,283 +1,286 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. 
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2022 Tino Reichardt */ #include #include #include #include #include #include #define TF(E, N) \ extern void ASMABI E(uint64_t s[8], const void *, size_t); \ static inline void N(uint64_t s[8], const void *d, size_t b) { \ kfpu_begin(); E(s, d, b); kfpu_end(); \ } +#if defined(__x86_64) || defined(__aarch64__) || defined(__arm__) || \ + defined(__aarch64__) || defined(__arm__) || defined(__PPC64__) /* some implementation is always okay */ static inline boolean_t sha2_is_supported(void) { return (B_TRUE); } +#endif #if defined(__x86_64) /* Users of ASMABI requires all calls to be from wrappers */ extern void ASMABI zfs_sha512_transform_x64(uint64_t s[8], const void *, size_t); static inline void tf_sha512_transform_x64(uint64_t s[8], const void *d, size_t b) { zfs_sha512_transform_x64(s, d, b); } const sha512_ops_t sha512_x64_impl = { .is_supported = sha2_is_supported, .transform = tf_sha512_transform_x64, .name = "x64" }; #if defined(HAVE_AVX) static boolean_t sha2_have_avx(void) { return (kfpu_allowed() && zfs_avx_available()); } TF(zfs_sha512_transform_avx, tf_sha512_avx); const sha512_ops_t sha512_avx_impl = { .is_supported = sha2_have_avx, .transform = tf_sha512_avx, .name = "avx" }; #endif #if defined(HAVE_AVX2) static boolean_t sha2_have_avx2(void) { return (kfpu_allowed() && zfs_avx2_available()); } TF(zfs_sha512_transform_avx2, tf_sha512_avx2); const sha512_ops_t sha512_avx2_impl = { .is_supported = sha2_have_avx2, .transform = tf_sha512_avx2, .name = "avx2" }; #endif #elif defined(__aarch64__) || defined(__arm__) extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t); const sha512_ops_t sha512_armv7_impl = { .is_supported = sha2_is_supported, .transform = zfs_sha512_block_armv7, .name = "armv7" }; #if defined(__aarch64__) static boolean_t sha512_have_armv8ce(void) { return (kfpu_allowed() && zfs_sha512_available()); } TF(zfs_sha512_block_armv8, tf_sha512_armv8ce); const sha512_ops_t sha512_armv8_impl = { .is_supported = sha512_have_armv8ce, .transform = tf_sha512_armv8ce, .name = "armv8-ce" }; #endif #if defined(__arm__) && __ARM_ARCH > 6 static boolean_t sha512_have_neon(void) { return (kfpu_allowed() && zfs_neon_available()); } TF(zfs_sha512_block_neon, tf_sha512_neon); const sha512_ops_t sha512_neon_impl = { .is_supported = sha512_have_neon, .transform = tf_sha512_neon, .name = "neon" }; #endif #elif defined(__PPC64__) TF(zfs_sha512_ppc, tf_sha512_ppc); const sha512_ops_t sha512_ppc_impl = { .is_supported = sha2_is_supported, .transform = tf_sha512_ppc, .name = "ppc" }; static boolean_t sha512_have_isa207(void) { return (kfpu_allowed() && zfs_isa207_available()); } TF(zfs_sha512_power8, tf_sha512_power8); const sha512_ops_t sha512_power8_impl = { .is_supported = sha512_have_isa207, .transform = tf_sha512_power8, .name = "power8" }; #endif /* __PPC64__ */ /* the two generic ones */ extern const sha512_ops_t sha512_generic_impl; /* array with all sha512 implementations */ static const sha512_ops_t *const sha512_impls[] = { &sha512_generic_impl, #if defined(__x86_64) &sha512_x64_impl, 
#endif #if defined(__x86_64) && defined(HAVE_AVX) &sha512_avx_impl, #endif #if defined(__x86_64) && defined(HAVE_AVX2) &sha512_avx2_impl, #endif #if defined(__aarch64__) || defined(__arm__) &sha512_armv7_impl, #if defined(__aarch64__) &sha512_armv8_impl, #endif #if defined(__arm__) && __ARM_ARCH > 6 &sha512_neon_impl, #endif #endif #if defined(__PPC64__) &sha512_ppc_impl, &sha512_power8_impl, #endif /* __PPC64__ */ }; /* use the generic implementation functions */ #define IMPL_NAME "sha512" #define IMPL_OPS_T sha512_ops_t #define IMPL_ARRAY sha512_impls #define IMPL_GET_OPS sha512_get_ops #define ZFS_IMPL_OPS zfs_sha512_ops #include #ifdef _KERNEL #define IMPL_FMT(impl, i) (((impl) == (i)) ? "[%s] " : "%s ") #if defined(__linux__) static int sha512_param_get(char *buffer, zfs_kernel_param_t *unused) { const uint32_t impl = IMPL_READ(generic_impl_chosen); char *fmt; int cnt = 0; /* cycling */ fmt = IMPL_FMT(impl, IMPL_CYCLE); cnt += sprintf(buffer + cnt, fmt, "cycle"); /* list fastest */ fmt = IMPL_FMT(impl, IMPL_FASTEST); cnt += sprintf(buffer + cnt, fmt, "fastest"); /* list all supported implementations */ generic_impl_init(); for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); cnt += sprintf(buffer + cnt, fmt, generic_supp_impls[i]->name); } return (cnt); } static int sha512_param_set(const char *val, zfs_kernel_param_t *unused) { (void) unused; return (generic_impl_setname(val)); } #elif defined(__FreeBSD__) #include static int sha512_param(ZFS_MODULE_PARAM_ARGS) { int err; generic_impl_init(); if (req->newptr == NULL) { const uint32_t impl = IMPL_READ(generic_impl_chosen); const int init_buflen = 64; const char *fmt; struct sbuf *s; s = sbuf_new_for_sysctl(NULL, NULL, init_buflen, req); /* cycling */ fmt = IMPL_FMT(impl, IMPL_CYCLE); (void) sbuf_printf(s, fmt, "cycle"); /* list fastest */ fmt = IMPL_FMT(impl, IMPL_FASTEST); (void) sbuf_printf(s, fmt, "fastest"); /* list all supported implementations */ for (uint32_t i = 0; i < generic_supp_impls_cnt; ++i) { fmt = IMPL_FMT(impl, i); (void) sbuf_printf(s, fmt, generic_supp_impls[i]->name); } err = sbuf_finish(s); sbuf_delete(s); return (err); } /* we got module parameter */ char buf[16]; err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err) { return (err); } return (-generic_impl_setname(buf)); } #endif #undef IMPL_FMT ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs, zfs_, sha512_impl, sha512_param_set, sha512_param_get, ZMOD_RW, \ "Select SHA512 implementation."); #endif #undef TF diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c index ace2360c032d..393bfaa65ff5 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/sysctl_os.c @@ -1,826 +1,582 @@ // SPDX-License-Identifier: BSD-2-Clause /* * Copyright (c) 2020 iXsystems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0, "ZFS Block Reference Table"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vnops, CTLFLAG_RW, 0, "ZFS VNOPS"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS livelist condense"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, file, CTLFLAG_RW, 0, "ZFS VDEV file"); SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, "ZFS VDEV mirror"); SYSCTL_DECL(_vfs_zfs_version); SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, (ZFS_META_VERSION "-" 
ZFS_META_RELEASE), "OpenZFS module version"); /* arc.c */ int param_set_arc_u64(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_64(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_int(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_int(oidp, arg1, 0, req); if (err != 0 || req->newptr == NULL) return (err); arc_tuning_update(B_TRUE); return (0); } int param_set_arc_max(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_arc_max; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val != 0 && (val < MIN_ARC_MAX || val <= arc_c_min || val >= arc_all_memory())) return (SET_ERROR(EINVAL)); zfs_arc_max = val; arc_tuning_update(B_TRUE); /* Update the sysctl to the tuned value */ if (val != 0) zfs_arc_max = arc_c_max; return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, - CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_max, "LU", - "Maximum ARC size in bytes (LEGACY)"); - int param_set_arc_min(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_arc_min; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val != 0 && (val < 2ULL << SPA_MAXBLOCKSHIFT || val > arc_c_max)) return (SET_ERROR(EINVAL)); zfs_arc_min = val; arc_tuning_update(B_TRUE); /* Update the sysctl to the tuned value */ if (val != 0) zfs_arc_min = arc_c_min; return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, - CTLTYPE_ULONG | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_min, "LU", - "Minimum ARC size in bytes (LEGACY)"); - extern uint_t zfs_arc_free_target; int param_set_arc_free_target(SYSCTL_HANDLER_ARGS) { uint_t val; int err; val = zfs_arc_free_target; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < minfree) return (EINVAL); if (val > vm_cnt.v_page_count) return (EINVAL); zfs_arc_free_target = val; return (0); } -/* - * NOTE: This sysctl is CTLFLAG_RW not CTLFLAG_RWTUN due to its dependency on - * pagedaemon initialization. 
- */ -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, - CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_free_target, "IU", - "Desired number of free pages below which ARC triggers reclaim" - " (LEGACY)"); - int param_set_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) { int err, val; val = arc_no_grow_shift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val >= arc_shrink_shift) return (EINVAL); arc_no_grow_shift = val; return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, - CTLTYPE_INT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - NULL, 0, param_set_arc_no_grow_shift, "I", - "log2(fraction of ARC which must be free to allow growing) (LEGACY)"); - -extern uint64_t l2arc_write_max; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, - CTLFLAG_RWTUN, &l2arc_write_max, 0, - "Max write bytes per interval (LEGACY)"); - -extern uint64_t l2arc_write_boost; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, - CTLFLAG_RWTUN, &l2arc_write_boost, 0, - "Extra write bytes during device warmup (LEGACY)"); - -extern uint64_t l2arc_headroom; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, - CTLFLAG_RWTUN, &l2arc_headroom, 0, - "Number of max device writes to precache (LEGACY)"); - -extern uint64_t l2arc_headroom_boost; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom_boost, - CTLFLAG_RWTUN, &l2arc_headroom_boost, 0, - "Compressed l2arc_headroom multiplier (LEGACY)"); - -extern uint64_t l2arc_feed_secs; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, - CTLFLAG_RWTUN, &l2arc_feed_secs, 0, - "Seconds between L2ARC writing (LEGACY)"); - -extern uint64_t l2arc_feed_min_ms; - -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, - CTLFLAG_RWTUN, &l2arc_feed_min_ms, 0, - "Min feed interval in milliseconds (LEGACY)"); - -extern int l2arc_noprefetch; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, - CTLFLAG_RWTUN, &l2arc_noprefetch, 0, - "Skip caching prefetched buffers (LEGACY)"); - -extern int l2arc_feed_again; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, - CTLFLAG_RWTUN, &l2arc_feed_again, 0, - "Turbo L2ARC warmup (LEGACY)"); - -extern int l2arc_norw; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, - CTLFLAG_RWTUN, &l2arc_norw, 0, - "No reads during writes (LEGACY)"); - -static int -param_get_arc_state_size(SYSCTL_HANDLER_ARGS) -{ - arc_state_t *state = (arc_state_t *)arg1; - int64_t val; - - val = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]) + - zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); - return (sysctl_handle_64(oidp, &val, 0, req)); -} - -extern arc_state_t ARC_anon; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, anon_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_anon, 0, param_get_arc_state_size, "Q", - "size of anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in anonymous state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, - &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in anonymous state"); - -extern arc_state_t ARC_mru; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_mru, 0, param_get_arc_state_size, "Q", - "size of mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, - &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in mru state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, - 
&ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in mru state"); - -extern arc_state_t ARC_mru_ghost; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, mru_ghost_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_mru_ghost, 0, param_get_arc_state_size, "Q", - "size of mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in mru ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, - &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in mru ghost state"); - -extern arc_state_t ARC_mfu; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_mfu, 0, param_get_arc_state_size, "Q", - "size of mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in mfu state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, - &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in mfu state"); - -extern arc_state_t ARC_mfu_ghost; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, mfu_ghost_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_mfu_ghost, 0, param_get_arc_state_size, "Q", - "size of mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in mfu ghost state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, - &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in mfu ghost state"); - -extern arc_state_t ARC_uncached; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, uncached_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_uncached, 0, param_get_arc_state_size, "Q", - "size of uncached state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_metadata_esize, CTLFLAG_RD, - &ARC_uncached.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, - "size of evictable metadata in uncached state"); -SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, uncached_data_esize, CTLFLAG_RD, - &ARC_uncached.arcs_esize[ARC_BUFC_DATA].rc_count, 0, - "size of evictable data in uncached state"); - -extern arc_state_t ARC_l2c_only; - -SYSCTL_PROC(_vfs_zfs, OID_AUTO, l2c_only_size, - CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, - &ARC_l2c_only, 0, param_get_arc_state_size, "Q", - "size of l2c_only state"); - -/* dbuf.c */ - -/* dmu.c */ - -/* dmu_zfetch.c */ - -SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); - -extern uint32_t zfetch_max_distance; - -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, - CTLFLAG_RWTUN, &zfetch_max_distance, 0, - "Max bytes to prefetch per stream (LEGACY)"); - -extern uint32_t zfetch_max_idistance; - -SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_idistance, - CTLFLAG_RWTUN, &zfetch_max_idistance, 0, - "Max bytes to prefetch indirects for per stream (LEGACY)"); - -/* dsl_pool.c */ - -/* dnode.c */ - -/* dsl_scan.c */ - /* metaslab.c */ int param_set_active_allocator(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_active_allocator, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_active_allocator) == 0) return (0); return (param_set_active_allocator_common(buf)); } /* * In pools where the log space map feature is not enabled we touch * multiple 
metaslabs (and their respective space maps) with each * transaction group. Thus, we benefit from having a small space map * block size since it allows us to issue more I/O operations scattered * around the disk. So a sane default for the space map block size * is 8~16K. */ extern int zfs_metaslab_sm_blksz_no_log; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_no_log, 0, "Block size for space map in pools with log space map disabled. " "Power of 2 greater than 4096."); /* * When the log space map feature is enabled, we accumulate a lot of * changes per metaslab that are flushed once in a while so we benefit * from a bigger block size like 128K for the metaslab space maps. */ extern int zfs_metaslab_sm_blksz_with_log; SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, &zfs_metaslab_sm_blksz_with_log, 0, "Block size for space map in pools with log space map enabled. " "Power of 2 greater than 4096."); /* * The in-core space map representation is more compact than its on-disk form. * The zfs_condense_pct determines how much more compact the in-core * space map representation must be before we compact it on-disk. * Values should be greater than or equal to 100. */ extern uint_t zfs_condense_pct; SYSCTL_UINT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, &zfs_condense_pct, 0, "Condense on-disk spacemap when it is more than this many percents" " of in-memory counterpart"); -extern uint_t zfs_remove_max_segment; - -SYSCTL_UINT(_vfs_zfs, OID_AUTO, remove_max_segment, - CTLFLAG_RWTUN, &zfs_remove_max_segment, 0, - "Largest contiguous segment ZFS will attempt to allocate when removing" - " a device"); - -extern int zfs_removal_suspend_progress; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, - CTLFLAG_RWTUN, &zfs_removal_suspend_progress, 0, - "Ensures certain actions can happen while in the middle of a removal"); - /* * Minimum size which forces the dynamic allocator to change * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ extern uint64_t metaslab_df_alloc_threshold; SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, &metaslab_df_alloc_threshold, 0, "Minimum size which forces the dynamic allocator to change its" " allocation strategy"); /* * The minimum free space, in percent, which must be available * in a space map to continue allocations in a first-fit fashion. * Once the space map's free space drops below this level we dynamically * switch to using best-fit allocations. 
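 *
 * Illustrative arithmetic only (this change does not set the value):
 * assuming the upstream default of metaslab_df_free_pct = 4, a 16 GiB
 * metaslab keeps allocating first-fit until its free space drops below
 * roughly 0.64 GiB (4% of 16 GiB), after which the allocator falls back
 * to the more expensive best-fit (search-by-size) strategy.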
*/ extern uint_t metaslab_df_free_pct; SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, &metaslab_df_free_pct, 0, "The minimum free space, in percent, which must be available in a" " space map to continue allocations in a first-fit fashion"); /* mmp.c */ int param_set_multihost_interval(SYSCTL_HANDLER_ARGS) { int err; err = sysctl_handle_64(oidp, &zfs_multihost_interval, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (spa_mode_global != SPA_MODE_UNINIT) mmp_signal_all_threads(); return (0); } /* spa.c */ extern int zfs_ccw_retry_interval; SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval" " (seconds)"); extern uint64_t zfs_max_missing_tvds_cachefile; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, &zfs_max_missing_tvds_cachefile, 0, "Allow importing pools with missing top-level vdevs in cache file"); extern uint64_t zfs_max_missing_tvds_scan; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, &zfs_max_missing_tvds_scan, 0, "Allow importing pools with missing top-level vdevs during scan"); /* spa_misc.c */ extern int zfs_flags; static int sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) { int err, val; val = zfs_flags; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); /* * ZFS_DEBUG_MODIFY must be enabled prior to boot so all * arc buffers in the system have the necessary additional * checksum data. However, it is safe to disable at any * time. */ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) val &= ~ZFS_DEBUG_MODIFY; zfs_flags = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); int param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_synctime_ms; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_synctime_ms = val; spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) { unsigned long val; int err; val = zfs_deadman_ziotime_ms; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); zfs_deadman_ziotime_ms = val; spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms)); return (0); } int param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) { char buf[16]; int rc; if (req->newptr == NULL) strlcpy(buf, zfs_deadman_failmode, sizeof (buf)); rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (rc || req->newptr == NULL) return (rc); if (strcmp(buf, zfs_deadman_failmode) == 0) return (0); if (strcmp(buf, "wait") == 0) zfs_deadman_failmode = "wait"; if (strcmp(buf, "continue") == 0) zfs_deadman_failmode = "continue"; if (strcmp(buf, "panic") == 0) zfs_deadman_failmode = "panic"; return (-param_set_deadman_failmode_common(buf)); } int param_set_raidz_impl(SYSCTL_HANDLER_ARGS) { const size_t bufsize = 128; char *buf; int rc; buf = malloc(bufsize, M_SOLARIS, M_WAITOK | M_ZERO); if (req->newptr == NULL) vdev_raidz_impl_get(buf, bufsize); rc = sysctl_handle_string(oidp, buf, bufsize, req); if (rc || req->newptr == NULL) { free(buf, M_SOLARIS); return (rc); } rc = vdev_raidz_impl_set(buf); free(buf, M_SOLARIS); return (rc); } int param_set_slop_shift(SYSCTL_HANDLER_ARGS) { int val; int err; val = spa_slop_shift; err = 
sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 1 || val > 31) return (EINVAL); spa_slop_shift = val; return (0); } /* spacemap.c */ extern int space_map_ibs; SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, &space_map_ibs, 0, "Space map indirect block shift"); /* vdev.c */ int param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) { int val; int err; val = zfs_vdev_min_auto_ashift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(EINVAL)); zfs_vdev_min_auto_ashift = val; return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), - param_set_min_auto_ashift, "IU", - "Min ashift used when creating new top-level vdev. (LEGACY)"); - int param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) { int val; int err; val = zfs_vdev_max_auto_ashift; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (SET_ERROR(err)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(EINVAL)); zfs_vdev_max_auto_ashift = val; return (0); } -SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, - CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, - &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), - param_set_max_auto_ashift, "IU", - "Max ashift used when optimizing for logical -> physical sector size on" - " new top-level vdevs. (LEGACY)"); - /* * Since the DTL space map of a vdev is not expected to have a lot of * entries, we default its block size to 4K. */ extern int zfs_vdev_dtl_sm_blksz; SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_dtl_sm_blksz, 0, "Block size for DTL space map. Power of 2 greater than 4096."); /* * vdev-wide space maps that have lots of entries written to them at * the end of each transaction can benefit from a higher I/O bandwidth * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. */ extern int zfs_vdev_standard_sm_blksz; SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, &zfs_vdev_standard_sm_blksz, 0, "Block size for standard space map. Power of 2 greater than 4096."); -extern int vdev_validate_skip; - -SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, - CTLFLAG_RDTUN, &vdev_validate_skip, 0, - "Enable to bypass vdev_validate()."); - -/* vdev_mirror.c */ - -/* vdev_queue.c */ - -extern uint_t zfs_vdev_max_active; - -SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, - CTLFLAG_RWTUN, &zfs_vdev_max_active, 0, - "The maximum number of I/Os of all types active for each device." - " (LEGACY)"); - /* zio.c */ SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, "Exclude metadata buffers from dumps as well"); diff --git a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c index 174141a5deab..120d97510c9e 100644 --- a/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1,7030 +1,7049 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2025, Klara, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef VN_OPEN_INVFS #define VN_OPEN_INVFS 0x0 #endif VFS_SMR_DECLARE; #ifdef DEBUG_VFS_LOCKS #define VNCHECKREF(vp) \ VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ ("%s: wrong ref counts", __func__)); #else #define VNCHECKREF(vp) #endif #if __FreeBSD_version >= 1400045 typedef uint64_t cookie_t; #else typedef ulong_t cookie_t; #endif static int zfs_check_attrname(const char *name); /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using zfs_enter(zfsvfs). * A zfs_exit(zfsvfs) is needed before all returns. Any znodes * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and zfs_exit(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass DMU_TX_NOWAIT as the second argument to * dmu_tx_assign(). 
This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from zfs_enter()), use DMU_TX_WAIT. * This reduces lock contention and CPU usage when we must wait (note * that if throughput is constrained by the storage, nearly every * transaction must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(DMU_TX_WAIT) and blocks in * txg_wait_open() forever, because the previous txg can't quiesce * until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is * DMU_TX_NOWAIT, then drop all locks, call dmu_tx_wait(), and try * again. On subsequent calls to dmu_tx_assign(), pass * DMU_TX_NOTHROTTLE in addition to DMU_TX_NOWAIT, to indicate that * this operation has already called dmu_tx_wait(). This will ensure * that we don't retry forever, waiting a short bit each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, * (waited ? DMU_TX_NOTHROTTLE : 0) | DMU_TX_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr) { (void) cr; znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * Keep a count of the synchronous opens in the znode. On first * synchronous open we must convert all previous async transactions * into sync to keep correct ordering. 
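 *
 * atomic_inc_32_nv() returns the post-increment value, so only the open
 * that takes z_sync_cnt from 0 to 1 pays for zil_async_to_sync();
 * subsequent O_SYNC opens merely bump the counter, and zfs_close()
 * drops it again as those descriptors go away.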
*/ if (flag & O_SYNC) { if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) zil_async_to_sync(zfsvfs->z_log, zp->z_id); } zfs_exit(zfsvfs, FTAG); return (0); } static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { (void) offset, (void) cr; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Decrement the synchronous opens in the znode */ if ((flag & O_SYNC) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } static int zfs_ioctl_getxattr(vnode_t *vp, zfsxattr_t *fsx) { znode_t *zp = VTOZ(vp); memset(fsx, 0, sizeof (*fsx)); fsx->fsx_xflags = (zp->z_pflags & ZFS_PROJINHERIT) ? ZFS_PROJINHERIT_FL : 0; fsx->fsx_projid = zp->z_projid; return (0); } static int zfs_ioctl_setflags(vnode_t *vp, uint32_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = VTOZ(vp)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & ~(ZFS_PROJINHERIT_FL)) return (SET_ERROR(EOPNOTSUPP)); xva_init(xva); xoap = xva_getxoptattr(xva); #define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \ if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \ ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \ XVA_SET_REQ(xva, (xflag)); \ (xfield) = ((ioctl_flags & (iflag)) != 0); \ } \ } while (0) FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, xoap->xoa_projinherit); #undef FLAG_CHANGE return (0); } static int zfs_ioctl_setxattr(vnode_t *vp, zfsxattr_t *fsx, cred_t *cr) { znode_t *zp = VTOZ(vp); xvattr_t xva; xoptattr_t *xoap; int err; if (!zpl_is_valid_projid(fsx->fsx_projid)) return (SET_ERROR(EINVAL)); err = zfs_ioctl_setflags(vp, fsx->fsx_xflags, &xva); if (err) return (err); xoap = xva_getxoptattr(&xva); XVA_SET_REQ(&xva, XAT_PROJID); xoap->xoa_projid = fsx->fsx_projid; err = zfs_setattr(zp, (vattr_t *)&xva, 0, cr, NULL); return (err); } static int zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, int *rvalp) { (void) flag, (void) cred, (void) rvalp; loff_t off; int error; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case F_SEEK_DATA: case F_SEEK_HOLE: { off = *(offset_t *)data; error = vn_lock(vp, LK_SHARED); if (error) return (error); /* offset parameter is in/out */ error = zfs_holey(VTOZ(vp), com, &off); VOP_UNLOCK(vp); if (error) return (error); *(offset_t *)data = off; return (0); } case ZFS_IOC_FSGETXATTR: { zfsxattr_t *fsx = (zfsxattr_t *)data; error = vn_lock(vp, LK_SHARED); if (error) return (error); error = zfs_ioctl_getxattr(vp, fsx); VOP_UNLOCK(vp); return (error); } case ZFS_IOC_FSSETXATTR: { zfsxattr_t *fsx = (zfsxattr_t *)data; error = vn_lock(vp, LK_EXCLUSIVE); if (error) return (error); error = zfs_ioctl_setxattr(vp, fsx, cred); VOP_UNLOCK(vp); return (error); } case ZFS_IOC_REWRITE: { zfs_rewrite_args_t *args = (zfs_rewrite_args_t *)data; if ((flag & FWRITE) == 0) return (SET_ERROR(EBADF)); error = vn_lock(vp, LK_SHARED); if (error) return (error); error = zfs_rewrite(VTOZ(vp), args->off, args->len, args->flags, args->arg); VOP_UNLOCK(vp); return (error); } } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. 
As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considered clean despite have some * dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); vm_object_pip_wakeup(pp->object); } static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t m; obj = vp->v_object; vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY); return (m); } static void page_unhold(vm_page_t pp) { vm_page_unwire(pp, PQ_ACTIVE); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { vm_object_t obj; struct sf_buf *sf; vnode_t *vp = ZTOV(zp); caddr_t va; int off; ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); off = start & PAGEOFFSET; vm_object_pip_add(obj, 1); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { va = zfs_map_page(pp, &sf); (void) dmu_read(os, zp->z_id, start + off, nbytes, va + off, DMU_READ_PREFETCH); zfs_unmap_page(sf); page_unbusy(pp); } len -= nbytes; off = 0; } vm_object_pip_wakeup(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. 
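 *
 * As written, the loop below grabs each page sbusied, copies one page
 * worth of data from the DMU through a temporary sf_buf mapping,
 * zero-fills the tail of a short final page, and marks the page valid
 * only if dmu_read() succeeded; on error the partially filled page is
 * freed or deactivated rather than entered into the page cache.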
*/ int mappedread_sf(znode_t *zp, int nbytes, zfs_uio_t *uio) { vnode_t *vp = ZTOV(zp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int error = 0; ASSERT3U(zfs_uio_segflg(uio), ==, UIO_NOCOPY); ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); ASSERT0(zfs_uio_offset(uio) & PAGEOFFSET); for (start = zfs_uio_offset(uio); len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (vm_page_none_valid(pp)) { va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) memset(va + bytes, 0, PAGESIZE - bytes); zfs_unmap_page(sf); if (error == 0) { vm_page_valid(pp); vm_page_activate(pp); vm_page_sunbusy(pp); } else { zfs_vmobject_wlock(obj); if (!vm_page_wired(pp) && pp->valid == 0 && vm_page_busy_tryupgrade(pp)) vm_page_free(pp); else { vm_page_deactivate_noreuse(pp); vm_page_sunbusy(pp); } zfs_vmobject_wunlock(obj); } } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_sunbusy(pp); } if (error) break; zfs_uio_advance(uio, bytes); len -= bytes; } return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { vnode_t *vp = ZTOV(zp); vm_object_t obj; int64_t start; int len = nbytes; int off; int error = 0; ASSERT3P(vp->v_mount, !=, NULL); obj = vp->v_object; ASSERT3P(obj, !=, NULL); start = zfs_uio_offset(uio); off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if ((pp = page_hold(vp, start))) { struct sf_buf *sf; caddr_t va; va = zfs_map_page(pp, &sf); error = vn_io_fault_uiomove(va + off, bytes, GET_UIO_STRUCT(uio)); zfs_unmap_page(sf); page_unhold(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes, DMU_READ_PREFETCH); } len -= bytes; off = 0; if (error) break; } return (error); } int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *presid) { int error = 0; ssize_t resid; error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos, UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread); if (error) { return (SET_ERROR(error)); } else if (presid == NULL) { if (resid != 0) { error = SET_ERROR(EIO); } } else { *presid = resid; } return (error); } void zfs_zrele_async(znode_t *zp) { vnode_t *vp = ZTOV(zp); objset_t *os = ITOZSB(vp)->z_os; VN_RELE_ASYNC(vp, dsl_pool_zrele_taskq(dmu_objset_pool(os))); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs; int error; int ltype; if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(dvp, __func__); if (name[0] == 0 || (name[0] == '.' 
&& name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (VN_IS_DOOMED(dvp)) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ static int zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, int flags, boolean_t cached) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; seqc_t dvp_seqc; int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, const char *, nm); if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) return (error); dvp_seqc = vn_seqc_read_notmodify(dvp); *vpp = NULL; if (flags & LOOKUP_XATTR) { /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOPNOTSUPP)); } /* * We don't allow recursive attributes.. * Maybe someday we will. 
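 *
 * (ZFS_XATTR set on zdp means the directory being searched is itself an
 * extended attribute directory, so a LOOKUP_XATTR request here would
 * nest attribute namespaces; that is what the check just below rejects
 * with EINVAL.)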
*/ if (zdp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) { zfs_exit(zfsvfs, FTAG); return (error); } *vpp = ZTOV(zp); /* * Do we have permission to get into attribute directory? */ if (flags & LOOKUP_NAMED_ATTR) error = zfs_zaccess(zp, ACE_EXECUTE, V_NAMEDATTR, B_FALSE, cr, NULL); else error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr, NULL); if (error) { vrele(ZTOV(zp)); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Check accessibility of directory if we're not coming in via * VOP_CACHEDLOOKUP. */ if (!cached) { #ifdef NOEXECCHECK if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; } else #endif if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; zfs_exit(zfsvfs, FTAG); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { zfs_exit(zfsvfs, FTAG); if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED) return (SET_ERROR(ENOENT)); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); zfs_exit(zfsvfs, FTAG); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) { vput(ZTOV(zp)); *vpp = NULL; return (error); } if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { zfs_exit(zfsvfs, FTAG); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { zfs_exit(zfsvfs, FTAG); break; } vput(ZTOV(zp)); } if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; #if __FreeBSD_version < 1400068 cnp->cn_flags |= SAVENAME; #endif break; } zfs_fallthrough; case DELETE: #if __FreeBSD_version < 1400068 if (error == 0) cnp->cn_flags |= SAVENAME; #endif break; } } if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * FIXME: zfs_lookup_lock relocks vnodes and does nothing to * handle races. 
In particular different callers may end up * with different vnodes and will try to add conflicting * entries to the namecache. * * While finding different result may be acceptable in face * of concurrent modification, adding conflicting entries * trips over an assert in the namecache. * * Ultimately let an entry through once everything settles. */ if (!vn_seqc_consistent(dvp, dvp_seqc)) { cnp->cn_flags &= ~MAKEENTRY; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } static inline bool is_nametoolong(zfsvfs_t *zfsvfs, const char *name) { size_t dlen = strlen(name); return ((!zfsvfs->z_longname && dlen >= ZAP_MAXNAMELEN) || dlen >= ZAP_MAXNAMELEN_NEW); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * mnt_ns - Unused on FreeBSD * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ int zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; uid_t uid = crgetuid(cr); gid_t gid = crgetgid(cr); uint64_t projid = ZFS_DEFAULT_PROJID; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype; #ifdef DEBUG_VFS_LOCKS vnode_t *dvp = ZTOV(dzp); #endif if (is_nametoolong(zfsvfs, name)) return (SET_ERROR(ENAMETOOLONG)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } *zpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } ASSERT0P(zp); /* * Create a new file object and update the directory * to reference it. 
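 *
 * The steps are: check ACE_ADD_FILE on the directory, build the ACL ids
 * for the new object, reserve a vnode, assign a transaction holding the
 * new SA and the directory ZAP entry, then zfs_mknode() plus
 * zfs_link_create(), and finally record the create in the ZIL.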
*/ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, NULL)) != 0) goto out; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve(); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dzp, name, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); VOP_UNLOCK(ZTOV(zp)); zrele(zp); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: VNCHECKREF(dvp); if (error == 0) { *zpp = zp; } if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ static int zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked; uint64_t txtype; int error; if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zp = VTOZ(vp); if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } zilog = zfsvfs->z_log; xattr_obj = 0; xzp = NULL; if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. 
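 *
 * If this turns out to be the last link, zfs_link_destroy() reports
 * unlinked and the znode is parked on the unlinked set instead of being
 * freed here; its space is reclaimed later, when the last hold on the
 * vnode is released (or when the unlinked set is drained on a
 * subsequent mount).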
*/ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } /* XXX check changes to linux vnops */ txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } static int zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, struct componentname *cnp, int nameiop) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; cnp->cn_nameptr = __DECONST(char *, name); cnp->cn_namelen = strlen(name); cnp->cn_nameiop = nameiop; cnp->cn_flags = ISLASTCN; #if __FreeBSD_version < 1400068 cnp->cn_flags |= SAVENAME; #endif cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cnp->cn_cred = kcred; #if __FreeBSD_version < 1400037 cnp->cn_thread = curthread; #endif if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) { struct vop_lookup_args a; a.a_gen.a_desc = &vop_lookup_desc; a.a_dvp = ZTOV(dzp); a.a_vpp = vpp; a.a_cnp = cnp; error = vfs_cache_lookup(&a); } else { error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, 0, B_FALSE); } #ifdef ZFS_DEBUG if (error) { printf("got error %d on name %s on op %d\n", error, name, nameiop); kdb_backtrace(); } #endif return (error); } int zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags) { vnode_t *vp; int error; struct componentname cn; if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) return (error); error = zfs_remove_(ZTOV(dzp), vp, name, cr); vput(vp); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * mnt_ns - Unused on FreeBSD * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ int zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) { (void) flags, (void) vsecp; znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; uid_t uid = crgetuid(cr); gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT3U(vap->va_type, ==, VDIR); if (is_nametoolong(zfsvfs, dirname)) return (SET_ERROR(ENAMETOOLONG)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids, NULL)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *zpp = NULL; if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } ASSERT0P(zp); if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve(); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dzp, dirname, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); VOP_UNLOCK(ZTOV(zp)); zrele(zp); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. 
* * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ static int zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } zilog = zfsvfs->z_log; if ((error = zfs_zaccess_delete(dzp, zp, cr, NULL))) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); if (zfsvfs->z_use_namecache) cache_vop_rmdir(dvp, vp); out: if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags) { struct componentname cn; vnode_t *vp; int error; if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) return (error); error = zfs_rmdir_(ZTOV(dzp), vp, name, cr); vput(vp); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * ncookies- number of entries in cookies * cookies - offsets to directory entries * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. 
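 *
 * Nonzero ZAP cookies therefore start at 16, leaving offsets 0 through 3
 * free for the synthetic entries; that is why the code below starts a
 * fresh cursor for offset <= 3 and restores a serialized cursor for
 * anything larger.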
*/ static int zfs_readdir(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, int *eofp, int *ncookies, cookie_t **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; ssize_t orig_resid; zap_cursor_t zc; zap_attribute_t *zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; uint8_t type; int ncooks; cookie_t *cooks = NULL; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (GET_UIO_STRUCT(uio)->uio_iov->iov_len <= 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = (zp->z_unlinked != 0)) != 0) { zfs_exit(zfsvfs, FTAG); return (0); } error = 0; os = zfsvfs->z_os; offset = zfs_uio_offset(uio); orig_resid = zfs_uio_resid(uio); prefetch = zp->z_zn_prefetch; zap = zap_attribute_long_alloc(); /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = GET_UIO_STRUCT(uio)->uio_iov; bytes_wanted = iovp->iov_len; if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = zfs_uio_resid(uio) / (sizeof (struct dirent) - sizeof (((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof (*cooks), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap->za_name, "."); zap->za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap->za_name, ".."); zap->za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME); zap->za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, zap))) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap->za_integer_length != 8 || zap->za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap->za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap->za_first_integer); } reclen = DIRENT64_RECLEN(strlen(zap->za_name)); /* * Will this entry fit in the buffer? 
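 * reclen already accounts for the fixed dirent64 header as well as the
 * name, so the check below needs no separate allowance for the header.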
*/ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap->za_name); /* NOTE: d_off is the offset for the *next* entry. */ next = &odp->d_off; strlcpy(odp->d_name, zap->za_name, odp->d_namlen + 1); odp->d_type = type; dirent_terminate(odp); odp = (dirent64_t *)((intptr_t)odp + reclen); outcount += reclen; ASSERT3S(outcount, <=, bufsize); if (prefetch) dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } /* Fill the offset right after advancing the cursor. */ if (next != NULL) *next = offset; if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (zfs_uio_segflg(uio) == UIO_SYSSPACE && zfs_uio_iovcnt(uio) == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; zfs_uio_resid(uio) -= outcount; } else if ((error = zfs_uiomove(outbuf, (long)outcount, UIO_READ, uio))) { /* * Reset the pointer. */ offset = zfs_uio_offset(uio); } update: zap_cursor_fini(&zc); zap_attribute_free(zap); if (zfs_uio_segflg(uio) != UIO_SYSSPACE || zfs_uio_iovcnt(uio) != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = orig_resid == zfs_uio_resid(uio) ? EINVAL : 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_uio_setoffset(uio, offset); zfs_exit(zfsvfs, FTAG); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. 
*/ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; vn_fsid(vp, vap); vap->va_nodeid = zp->z_id; vap->va_nlink = zp->z_links; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && zp->z_links < ZFS_LINK_MAX) vap->va_nlink++; vap->va_size = zp->z_size; if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); else vap->va_rdev = NODEV; vap->va_gen = zp->z_gen; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { xoap->xoa_projinherit = ((zp->z_pflags & ZFS_PROJINHERIT) != 0); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { xoap->xoa_projid = zp->z_projid; XVA_SET_RTN(xvap, XAT_PROJID); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); 
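	/*
	 * Note that va_atime above comes from the in-core zp->z_atime, while
	 * mtime, ctime and crtime were fetched by the SA bulk lookup earlier
	 * in this function.
	 */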
sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } zfs_exit(zfsvfs, FTAG); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t *zap; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap = zap_attribute_alloc(); zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, zap)) == 0) { count = 0; if (zap->za_integer_length != 8 || zap->za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lookup(dzp, zap->za_name, &zp, ZEXISTS); if (err == ENOENT) goto next; if (err) break; if (zp->z_uid == dzp->z_uid && zp->z_gid == dzp->z_gid && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, DMU_TX_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (zp->z_uid != dzp->z_uid) { uid = dzp->z_uid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); zp->z_uid = uid; } if (zp->z_gid != dzp->z_gid) { gid = dzp->z_gid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); zp->z_gid = gid; } uint64_t projid = dzp->z_projid; if (zp->z_projid != projid) { if (!(zp->z_pflags & ZFS_PROJID)) { err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) { err = 0; } else if (err != 0) { goto sa_add_projid_err; } else { projid = ZFS_INVALID_PROJID; } } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } } sa_add_projid_err: mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else if (projid == ZFS_INVALID_PROJID) { dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); } zap_cursor_fini(&zc); zap_attribute_free(zap); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * mnt_ns - Unused on FreeBSD * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. 
*/ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); os = zfsvfs->z_os; zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * Note: ZFS_READONLY is handled in zfs_zaccess_common. */ /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. 
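 * The TIMESPEC_OVERFLOW() checks below are what enforce this limit.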
*/ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOVERFLOW)); } } if (xoap != NULL && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOVERFLOW)); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOPNOTSUPP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EOPNOTSUPP)); } } attrzp = NULL; aclp = NULL; if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { zfs_exit(zfsvfs, FTAG); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr, mnt_ns); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. 
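 * The cleared bits are remembered in tmpxvattr and re-asserted on xvap
 * just before zfs_xvattr_set() is called.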
*/ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { zfs_exit(zfsvfs, FTAG); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { zfs_exit(zfsvfs, FTAG); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). 
*/ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } tx = dmu_tx_create(os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, DMU_TX_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. 
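 * sa_add_projid() below performs that layout change; an EEXIST return
 * means the slot already exists and is treated as success.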
*/ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT0(err); if (attrzp) { vn_seqc_write_begin(ZTOV(attrzp)); err = zfs_acl_chown_setattr(attrzp); vn_seqc_write_end(ZTOV(attrzp)); ASSERT0(err); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. 
*/ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT3S(vp->v_type, ==, VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT0(err2); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err = zfs_setattr_dir(attrzp); } } out2: if (err == 0 && os->os_sync == ZFS_SYNC_ALWAYS) err = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (err); } /* * Look up the directory entries corresponding to the source and target * directory/name pairs. */ static int zfs_rename_relock_lookup(znode_t *sdzp, const struct componentname *scnp, znode_t **szpp, znode_t *tdzp, const struct componentname *tcnp, znode_t **tzpp) { zfsvfs_t *zfsvfs; znode_t *szp, *tzp; int error; /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind zfs_enter and zfs_exit is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) return (error); if ((error = zfs_verify_zp(tdzp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. */ error = zfs_dirent_lookup(sdzp, scnp->cn_nameptr, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } *szpp = szp; /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tcnp->cn_nameptr, &tzp, 0); if (error != 0) { vrele(ZTOV(szp)); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } *tzpp = tzp; out: zfs_exit(zfsvfs, FTAG); return (error); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. 
This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; int error; VOP_UNLOCK(tdvp); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK(*tvpp); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK(tdvp); goto relock; } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); error = zfs_rename_relock_lookup(sdzp, scnp, &szp, tdzp, tcnp, &tzp); if (error != 0) { VOP_UNLOCK(sdvp); VOP_UNLOCK(tdvp); goto out; } svp = ZTOV(szp); tvp = tzp != NULL ? ZTOV(tzp) : NULL; /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp); VOP_UNLOCK(tdvp); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK(nvp); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK(sdvp); VOP_UNLOCK(tdvp); VOP_UNLOCK(*svpp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } static int zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr); /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * scnp - Old entry name. * tdvp - Target directory to contain the "new entry". * tcnp - New entry name. * cr - credentials of caller. 
* INOUT: svpp - Source file * tvpp - Target file, may point to NULL initially * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ static int zfs_do_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { int error; ASSERT_VOP_ELOCKED(tdvp, __func__); if (*tvpp != NULL) ASSERT_VOP_ELOCKED(*tvpp, __func__); /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } error = zfs_do_rename_impl(sdvp, svpp, scnp, tdvp, tvpp, tcnp, cr); VOP_UNLOCK(sdvp); VOP_UNLOCK(*svpp); out: if (*tvpp != NULL) VOP_UNLOCK(*tvpp); if (tdvp != *tvpp) VOP_UNLOCK(tdvp); return (error); } static int zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr) { dmu_tx_t *tx; zfsvfs_t *zfsvfs; zilog_t *zilog; znode_t *tdzp, *sdzp, *tzp, *szp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) return (error); if ((error = zfs_verify_zp(sdzp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto out; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto out; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto out; } szp = VTOZ(*svpp); if ((error = zfs_verify_zp(szp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (tzp != NULL) { if ((error = zfs_verify_zp(tzp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto out; } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, NULL))) goto out; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. 
*/ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto out; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_check(szp, sdzp, tdzp))) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto out; } } } vn_seqc_write_begin(*svpp); vn_seqc_write_begin(sdvp); if (*tvpp != NULL) vn_seqc_write_begin(*tvpp); if (tdvp != *tvpp) vn_seqc_write_begin(tdvp); vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); goto out_seq; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. 
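 * That is why the removal below is wrapped in VERIFY0().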
*/ VERIFY0(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL)); } } if (error == 0) { cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp); } } dmu_tx_commit(tx); out_seq: vn_seqc_write_end(*svpp); vn_seqc_write_end(sdvp); if (*tvpp != NULL) vn_seqc_write_end(*tvpp); if (tdvp != *tvpp) vn_seqc_write_end(tdvp); out: if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) { struct componentname scn, tcn; vnode_t *sdvp, *tdvp; vnode_t *svp, *tvp; int error; svp = tvp = NULL; if (is_nametoolong(tdzp->z_zfsvfs, tname)) return (SET_ERROR(ENAMETOOLONG)); if (rflags != 0 || wo_vap != NULL) return (SET_ERROR(EINVAL)); sdvp = ZTOV(sdzp); tdvp = ZTOV(tdzp); error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); if (sdzp->z_zfsvfs->z_replay == B_FALSE) VOP_UNLOCK(sdvp); if (error != 0) goto fail; VOP_UNLOCK(svp); vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME); if (error == EJUSTRETURN) tvp = NULL; else if (error != 0) { VOP_UNLOCK(tdvp); goto fail; } error = zfs_do_rename(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr); fail: if (svp != NULL) vrele(svp); if (tvp != NULL) vrele(tvp); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * mnt_ns - Unused on FreeBSD * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) { (void) flags; znode_t *zp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; ASSERT3S(vap->va_type, ==, VLNK); if (is_nametoolong(zfsvfs, name)) return (SET_ERROR(ENAMETOOLONG)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids, NULL)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * Attempt to lock directory; fail if entry already exists. 
*/ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve(); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), __DECONST(void *, link), len, tx); else zfs_sa_symlink(zp, __DECONST(char *, link), len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dzp, name, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); VOP_UNLOCK(ZTOV(zp)); zrele(zp); } else { zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) error = zil_commit(zilog, 0); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ static int zfs_readlink(vnode_t *vp, zfs_uio_t *uio, cred_t *cr, caller_context_t *ct) { (void) cr, (void) ct; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. 
* * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ int zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, int flags) { (void) flags; znode_t *tzp; zfsvfs_t *zfsvfs = tdzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT3S(ZTOV(tdzp)->v_type, ==, VDIR); if (is_nametoolong(zfsvfs, name)) return (SET_ERROR(ENAMETOOLONG)); if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (ZTOV(szp)->v_type == VDIR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_verify_zp(szp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } error = zfs_link_create(tdzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(ZTOV(szp), ct); } if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) error = zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: ip - inode of file to free data in. * cmd - action to take (only F_FREESP supported). 
* bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * ip - ctime|mtime updated */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { (void) offset; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (cmd != F_FREESP) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { (void) cr, (void) ct; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); } _Static_assert(sizeof (struct zfid_short) <= sizeof (struct fid), "struct zfid_short bigger than struct fid"); _Static_assert(sizeof (struct zfid_long) <= sizeof (struct fid), "struct zfid_long bigger than struct fid"); static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { (void) ct; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? 
LONG_FID_LEN : SHORT_FID_LEN; fidp->fid_len = size; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } zfs_exit(zfsvfs, FTAG); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp; zfsvfs_t *zfsvfs; int error; switch (cmd) { case _PC_LINK_MAX: *valp = MIN(LONG_MAX, ZFS_LINK_MAX); return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); case _PC_ACL_EXTENDED: #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0; zfs_exit(zfsvfs, FTAG); #else *valp = 0; #endif return (0); case _PC_ACL_NFS4: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0; zfs_exit(zfsvfs, FTAG); return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; vm_object_t object; off_t start, end, obj_size; uint_t blksz; int pgsin_b, pgsin_a; int error; if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0) return (zfs_vm_pagerret_error); object = ma[0]->object; start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); /* * Lock a range covering all required and optional pages. * Note that we need to handle the case of the block size growing. */ for (;;) { uint64_t len; blksz = zp->z_blksz; len = roundup(end, blksz) - rounddown(start, blksz); lr = zfs_rangelock_tryenter(&zp->z_rangelock, rounddown(start, blksz), len, RL_READER); if (lr == NULL) { /* * Avoid a deadlock with update_pages(). We need to * hold the range lock when copying from the DMU, so * give up the busy lock to allow update_pages() to * proceed. We might need to allocate new pages, which * isn't quite right since this allocation isn't subject * to the page fault handler's OOM logic, but this is * the best we can do for now. 
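 * The pages are re-grabbed with vm_page_grab_pages() below once the
 * blocking range lock acquisition has completed.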
*/ for (int i = 0; i < count; i++) vm_page_xunbusy(ma[i]); lr = zfs_rangelock_enter(&zp->z_rangelock, rounddown(start, blksz), len, RL_READER); zfs_vmobject_wlock(object); (void) vm_page_grab_pages(object, OFF_TO_IDX(start), VM_ALLOC_NORMAL | VM_ALLOC_WAITOK | VM_ALLOC_ZERO, ma, count); zfs_vmobject_wunlock(object); } if (blksz == zp->z_blksz) break; zfs_rangelock_exit(lr); } zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (zfs_vm_pagerret_bad); } pgsin_b = 0; if (rbehind != NULL) { pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); pgsin_b = MIN(*rbehind, pgsin_b); } pgsin_a = 0; if (rahead != NULL) { pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); if (end + IDX_TO_OFF(pgsin_a) >= obj_size) pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); pgsin_a = MIN(*rahead, pgsin_a); } /* * NB: we need to pass the exact byte size of the data that we expect * to read after accounting for the file size. This is required because * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. */ for (int i = 0; i < count; i++) { int dummypgsin, count1, j, last_size; if (vm_page_any_valid(ma[i])) { ASSERT(vm_page_all_valid(ma[i])); continue; } for (j = i + 1; j < count; j++) { if (vm_page_any_valid(ma[j])) { ASSERT(vm_page_all_valid(ma[j])); break; } } count1 = j - i; dummypgsin = 0; last_size = j == count ? MIN(end, obj_size) - (end - PAGE_SIZE) : PAGE_SIZE; error = dmu_read_pages(zfsvfs->z_os, zp->z_id, &ma[i], count1, i == 0 ? &pgsin_b : &dummypgsin, j == count ? &pgsin_a : &dummypgsin, last_size); if (error != 0) break; i += count1 - 1; } zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, count*PAGE_SIZE); zfs_exit(zfsvfs, FTAG); if (error != 0) return (zfs_vm_pagerret_error); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); if (rbehind != NULL) *rbehind = pgsin_b; if (rahead != NULL) *rahead = pgsin_a; return (zfs_vm_pagerret_ok); } #ifndef _SYS_SYSPROTO_H_ struct vop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; }; #endif static int zfs_freebsd_getpages(struct vop_getpages_args *ap) { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } typedef struct { uint_t pca_npages; vm_page_t pca_pages[]; } putpage_commit_arg_t; static void zfs_putpage_commit_cb(void *arg, int err) { putpage_commit_arg_t *pca = arg; vm_object_t object = pca->pca_pages[0]->object; zfs_vmobject_wlock(object); for (uint_t i = 0; i < pca->pca_npages; i++) { vm_page_t pp = pca->pca_pages[i]; if (err == 0) { /* * Writeback succeeded, so undirty the page. If it * fails, we leave it in the same state it was. That's * most likely dirty, so it will get tried again some * other time. 
*/ vm_page_undirty(pp); } vm_page_sunbusy(pp); } vm_object_pip_wakeupn(object, pca->pca_npages); zfs_vmobject_wunlock(object); kmem_free(pca, offsetof(putpage_commit_arg_t, pca_pages[pca->pca_npages])); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; object = vp->v_object; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); pcount = btoc(len); ncount = pcount; for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; if (zfs_enter_verify_zp(zfsvfs, zp, FTAG) != 0) return (zfs_vm_pagerret_error); off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); boolean_t commit = (flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS; if (ncount == 0) goto out; if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, DMU_TX_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { vm_ooffset_t woff = off; size_t wlen = len; for (i = 0; wlen > 0; woff += tocopy, wlen -= tocopy, i++) { tocopy = MIN(PAGE_SIZE, wlen); va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, woff, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); if (commit) { /* * Caller requested that we commit immediately. We set * a callback on the log entry, to be called once its * on disk after the call to zil_commit() below. The * pages will be undirtied and unbusied there. 
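 * (zfs_putpage_commit_cb() above also frees the pca allocation once
 * the callback has run.)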
*/ putpage_commit_arg_t *pca = kmem_alloc( offsetof(putpage_commit_arg_t, pca_pages[ncount]), KM_SLEEP); pca->pca_npages = ncount; memcpy(pca->pca_pages, ma, sizeof (vm_page_t) * ncount); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, B_TRUE, B_FALSE, zfs_putpage_commit_cb, pca); for (i = 0; i < ncount; i++) rtvals[i] = zfs_vm_pagerret_pend; } else { /* * Caller just wants the page written back somewhere, * but doesn't need it committed yet. We've already * written it back to the DMU, so we just need to put * it on the async log, then undirty the page and * return. * * We cannot use a callback here, because it would keep * the page busy (locked) until it is eventually * written down at txg sync. */ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, B_FALSE, B_FALSE, NULL, NULL); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); } VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_rangelock_exit(lr); if (commit) { err = zil_commit(zfsvfs->z_log, zp->z_id); if (err != 0) { zfs_exit(zfsvfs, FTAG); return (err); } } dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, len); zfs_exit(zfsvfs, FTAG); return (rtvals[0]); } #ifndef _SYS_SYSPROTO_H_ struct vop_putpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; }; #endif static int zfs_freebsd_putpages(struct vop_putpages_args *ap) { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } #ifndef _SYS_SYSPROTO_H_ struct vop_bmap_args { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; }; #endif static int zfs_freebsd_bmap(struct vop_bmap_args *ap) { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_open(struct vop_open_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_close(struct vop_close_args *ap) { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_ioctl_args { struct vnode *a_vp; ulong_t a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_ioctl(struct vop_ioctl_args *ap) { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL)); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= O_APPEND; if (ioflags & IO_NDELAY) flags |= O_NONBLOCK; if (ioflags & IO_DIRECT) flags |= O_DIRECT; if (ioflags & IO_SYNC) flags |= O_SYNC; return (flags); } #ifndef _SYS_SYSPROTO_H_ struct vop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif static int zfs_freebsd_read(struct vop_read_args *ap) { zfs_uio_t uio; int error = 0; zfs_uio_init(&uio, ap->a_uio); error = zfs_read(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), ap->a_cred); /* * XXX We occasionally get an EFAULT for Direct 
I/O reads on * FreeBSD 13. This still needs to be resolved. The EFAULT comes * from: * zfs_uio_get__dio_pages_alloc() -> * zfs_uio_get_dio_pages_impl() -> * zfs_uio_iov_step() -> * zfs_uio_get_user_pages(). * We return EFAULT from zfs_uio_iov_step(). When a Direct I/O * read fails to map in the user pages (returning EFAULT), the * Direct I/O request is broken up into two separate IO requests * and issued separately using Direct I/O. */ #ifdef ZFS_DEBUG if (error == EFAULT && uio.uio_extflg & UIO_DIRECT) { #if 0 printf("%s(%d): Direct I/O read returning EFAULT " "uio = %p, zfs_uio_offset(uio) = %lu " "zfs_uio_resid(uio) = %lu\n", __FUNCTION__, __LINE__, &uio, zfs_uio_offset(&uio), zfs_uio_resid(&uio)); #endif } #endif return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif static int zfs_freebsd_write(struct vop_write_args *ap) { zfs_uio_t uio; zfs_uio_init(&uio, ap->a_uio); return (zfs_write(VTOZ(ap->a_vp), &uio, ioflags(ap->a_ioflag), ap->a_cred)); } /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ static int zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v) { vnode_t *vp; znode_t *zp; uint64_t pflags; vp = v->a_vp; zp = VTOZ_SMR(vp); if (__predict_false(zp == NULL)) return (EAGAIN); pflags = atomic_load_64(&zp->z_pflags); if (pflags & ZFS_AV_QUARANTINED) return (EAGAIN); if (pflags & ZFS_XATTR) return (EAGAIN); if ((pflags & ZFS_NO_EXECS_DENIED) == 0) return (EAGAIN); return (0); } static int zfs_freebsd_fplookup_symlink(struct vop_fplookup_symlink_args *v) { vnode_t *vp; znode_t *zp; char *target; vp = v->a_vp; zp = VTOZ_SMR(vp); if (__predict_false(zp == NULL)) { return (EAGAIN); } target = atomic_load_consume_ptr(&zp->z_cached_symlink); if (target == NULL) { return (EAGAIN); } return (cache_symlink_resolve(v->a_fpl, target, strlen(target))); } #ifndef _SYS_SYSPROTO_H_ struct vop_access_args { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_access(struct vop_access_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; if (ap->a_accmode == VEXEC) { if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0) return (0); } /* * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND. */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { #if __FreeBSD_version >= 1500040 /* For named attributes, do the checks. */ if ((vn_irflag_read(vp) & VIRF_NAMEDATTR) != 0) error = zfs_access(zp, accmode, V_NAMEDATTR, ap->a_cred); else #endif error = zfs_access(zp, accmode, 0, ap->a_cred); } /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred); } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories.
*/ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_lookup_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif #if __FreeBSD_version >= 1500040 static int zfs_lookup_nameddir(struct vnode *dvp, struct componentname *cnp, struct vnode **vpp) { struct vnode *xvp; int error, flags; *vpp = NULL; flags = LOOKUP_XATTR | LOOKUP_NAMED_ATTR; if ((cnp->cn_flags & CREATENAMED) != 0) flags |= CREATE_XATTR_DIR; error = zfs_lookup(dvp, NULL, &xvp, NULL, 0, cnp->cn_cred, flags, B_FALSE); if (error == 0) { if ((cnp->cn_flags & LOCKLEAF) != 0) error = vn_lock(xvp, cnp->cn_lkflags); if (error == 0) { vn_irflag_set_cond(xvp, VIRF_NAMEDDIR); *vpp = xvp; } else { vrele(xvp); } } return (error); } static ssize_t zfs_readdir_named(struct vnode *vp, char *buf, ssize_t blen, off_t *offp, int *eofflagp, struct ucred *cred, struct thread *td) { struct uio io; struct iovec iv; zfs_uio_t uio; int error; io.uio_offset = *offp; io.uio_segflg = UIO_SYSSPACE; io.uio_rw = UIO_READ; io.uio_td = td; iv.iov_base = buf; iv.iov_len = blen; io.uio_iov = &iv; io.uio_iovcnt = 1; io.uio_resid = blen; zfs_uio_init(&uio, &io); error = zfs_readdir(vp, &uio, cred, eofflagp, NULL, NULL); if (error != 0) return (-1); *offp = io.uio_offset; return (blen - io.uio_resid); } static bool zfs_has_namedattr(struct vnode *vp, struct ucred *cred) { struct componentname cn; struct vnode *xvp; struct dirent *dp; off_t offs; ssize_t rsize; char *buf, *cp, *endcp; int eofflag, error; bool ret; MNT_ILOCK(vp->v_mount); if ((vp->v_mount->mnt_flag & MNT_NAMEDATTR) == 0) { MNT_IUNLOCK(vp->v_mount); return (false); } MNT_IUNLOCK(vp->v_mount); /* Now see if a named attribute directory exists. */ cn.cn_flags = LOCKLEAF; cn.cn_lkflags = LK_SHARED; cn.cn_cred = cred; error = zfs_lookup_nameddir(vp, &cn, &xvp); if (error != 0) return (false); /* It exists, so see if there is any entry other than "." and "..". */ buf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK); ret = false; offs = 0; do { rsize = zfs_readdir_named(xvp, buf, DEV_BSIZE, &offs, &eofflag, cred, curthread); if (rsize <= 0) break; cp = buf; endcp = &buf[rsize]; while (cp < endcp) { dp = (struct dirent *)cp; if (dp->d_fileno != 0 && (dp->d_type == DT_REG || dp->d_type == DT_UNKNOWN) && !ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name) && ((dp->d_namlen == 1 && dp->d_name[0] != '.') || (dp->d_namlen == 2 && (dp->d_name[0] != '.' || dp->d_name[1] != '.')) || dp->d_namlen > 2)) { ret = true; break; } cp += dp->d_reclen; } } while (!ret && rsize > 0 && eofflag == 0); vput(xvp); free(buf, M_TEMP); return (ret); } static int zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; int error; struct vnode **vpp = ap->a_vpp, *dvp = ap->a_dvp, *xvp; bool is_nameddir, needs_nameddir, opennamed = false; /* * These variables are used to handle the named attribute cases: * opennamed - Is true when this is a call from open with O_NAMEDATTR * specified and it is the last component. * is_nameddir - Is true when the directory is a named attribute dir. * needs_nameddir - Is set when the lookup needs to look for/create * a named attribute directory. It is only set when is_nameddir * is false and opennamed is true. * xvp - Is the directory that the lookup needs to be done in.
* Usually dvp, unless needs_nameddir is true where it is the * result of the first non-named directory lookup. * Note that name caching must be disabled for named attribute * handling. */ needs_nameddir = false; xvp = dvp; opennamed = (cnp->cn_flags & (OPENNAMED | ISLASTCN)) == (OPENNAMED | ISLASTCN); is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0; if (is_nameddir && (cnp->cn_flags & ISLASTCN) == 0) return (ENOATTR); if (opennamed && !is_nameddir && (cnp->cn_flags & ISDOTDOT) != 0) return (ENOATTR); if (opennamed || is_nameddir) cnp->cn_flags &= ~MAKEENTRY; if (opennamed && !is_nameddir) needs_nameddir = true; ASSERT3U(cnp->cn_namelen, <, sizeof (nm)); error = 0; *vpp = NULL; if (needs_nameddir) { if (VOP_ISLOCKED(dvp) != LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); error = zfs_lookup_nameddir(dvp, cnp, &xvp); if (error == 0) is_nameddir = true; } if (error == 0) { if (!needs_nameddir || cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') { strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm))); error = zfs_lookup(xvp, nm, vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, 0, cached); if (is_nameddir && error == 0 && (cnp->cn_namelen != 1 || *cnp->cn_nameptr != '.') && (cnp->cn_flags & ISDOTDOT) == 0) { if ((*vpp)->v_type == VDIR) vn_irflag_set_cond(*vpp, VIRF_NAMEDDIR); else vn_irflag_set_cond(*vpp, VIRF_NAMEDATTR); } if (needs_nameddir && xvp != *vpp) vput(xvp); } else { /* * Lookup of "." when a named attribute dir is needed. */ *vpp = xvp; } } return (error); } #else static int zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT3U(cnp->cn_namelen, <, sizeof (nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, 0, cached)); } #endif static int zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) { return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); } #ifndef _SYS_SYSPROTO_H_ struct vop_lookup_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif static int zfs_cache_lookup(struct vop_lookup_args *ap) { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; #if __FreeBSD_version >= 1500040 if (zfsvfs->z_use_namecache && (ap->a_cnp->cn_flags & OPENNAMED) == 0) #else if (zfsvfs->z_use_namecache) #endif return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap, B_FALSE)); } #ifndef _SYS_SYSPROTO_H_ struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif static int zfs_freebsd_create(struct vop_create_args *ap) { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc, mode; struct vnode *dvp = ap->a_dvp; #if __FreeBSD_version >= 1500040 struct vnode *xvp; bool is_nameddir; #endif #if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); #endif vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; *ap->a_vpp = NULL; rc = 0; #if __FreeBSD_version >= 1500040 xvp = NULL; is_nameddir = (vn_irflag_read(dvp) & VIRF_NAMEDDIR) != 0; if (!is_nameddir && (cnp->cn_flags & OPENNAMED) != 0) { /* Needs a named attribute directory. 
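 * (OPENNAMED is set for O_NAMEDATTR opens: look up, or create, the
 * file's named attribute directory and perform the create in it.)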
*/ rc = zfs_lookup_nameddir(dvp, cnp, &xvp); if (rc == 0) { dvp = xvp; is_nameddir = true; } } if (is_nameddir && rc == 0) rc = zfs_check_attrname(cnp->cn_nameptr); #endif if (rc == 0) rc = zfs_create(VTOZ(dvp), cnp->cn_nameptr, vap, 0, mode, &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */, NULL); #if __FreeBSD_version >= 1500040 if (xvp != NULL) vput(xvp); #endif if (rc == 0) { *ap->a_vpp = ZTOV(zp); #if __FreeBSD_version >= 1500040 if (is_nameddir) vn_irflag_set_cond(*ap->a_vpp, VIRF_NAMEDATTR); #endif } if (zfsvfs->z_use_namecache && rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_remove(struct vop_remove_args *ap) { int error = 0; #if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); #endif #if __FreeBSD_version >= 1500040 if ((vn_irflag_read(ap->a_dvp) & VIRF_NAMEDDIR) != 0) error = zfs_check_attrname(ap->a_cnp->cn_nameptr); #endif if (error == 0) error = zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif static int zfs_freebsd_mkdir(struct vop_mkdir_args *ap) { vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc; #if __FreeBSD_version < 1400068 ASSERT(ap->a_cnp->cn_flags & SAVENAME); #endif vattr_init_mask(vap); *ap->a_vpp = NULL; rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp, ap->a_cnp->cn_cred, 0, NULL, NULL); if (rc == 0) *ap->a_vpp = ZTOV(zp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_rmdir(struct vop_rmdir_args *ap) { struct componentname *cnp = ap->a_cnp; #if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); #endif return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; cookie_t **a_cookies; }; #endif static int zfs_freebsd_readdir(struct vop_readdir_args *ap) { zfs_uio_t uio; zfs_uio_init(&uio, ap->a_uio); return (zfs_readdir(ap->a_vp, &uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } #ifndef _SYS_SYSPROTO_H_ struct vop_fsync_args { struct vnode *a_vp; int a_waitfor; struct thread *a_td; }; #endif static int zfs_freebsd_fsync(struct vop_fsync_args *ap) { vnode_t *vp = ap->a_vp; int err = 0; /* * Push any dirty mmap()'d data out to the DMU and ZIL, ready for * zil_commit() to be called in zfs_fsync(). */ if (vm_object_mightbedirty(vp->v_object)) { zfs_vmobject_wlock(vp->v_object); if (!vm_object_page_clean(vp->v_object, 0, 0, 0)) err = SET_ERROR(EIO); zfs_vmobject_wunlock(vp->v_object); if (err) { /* * Unclear what state things are in. zfs_putpages() * will ensure the pages remain dirty if they haven't * been written down to the DMU, but because there may * be nothing logged, we can't assume that zfs_sync() * -> zil_commit() will give us a useful error. It's * safest if we just error out here. 
*/ return (err); } } return (zfs_fsync(VTOZ(vp), 0, ap->a_td->td_ucred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif static int zfs_freebsd_getattr(struct vop_getattr_args *ap) { vattr_t *vap = ap->a_vap; xvattr_t xvap; ulong_t fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; #if __FreeBSD_version >= 1500040 if ((vn_irflag_read(ap->a_vp) & (VIRF_NAMEDDIR | VIRF_NAMEDATTR)) != 0) vap->va_bsdflags |= SFBSD_NAMEDATTR; #endif return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif static int zfs_freebsd_setattr(struct vop_setattr_args *ap) { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; ulong_t fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. 
*/ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on * objects they have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct vop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; #endif static int zfs_freebsd_rename(struct vop_rename_args *ap) { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error = 0; #if __FreeBSD_version < 1400068 ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); #endif #if __FreeBSD_version >= 1500040 if ((vn_irflag_read(fdvp) & VIRF_NAMEDDIR) != 0) { error = zfs_check_attrname(ap->a_fcnp->cn_nameptr); if (error == 0) error = zfs_check_attrname(ap->a_tcnp->cn_nameptr); } #endif if (error == 0) error = zfs_do_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; #endif static int zfs_freebsd_symlink(struct vop_symlink_args *ap) { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; znode_t *zp = NULL; char *symlink; size_t symlink_len; int rc; #if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); #endif vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. 
*/ vattr_init_mask(vap); *ap->a_vpp = NULL; rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, ap->a_target, &zp, cnp->cn_cred, 0 /* flags */, NULL); if (rc == 0) { *ap->a_vpp = ZTOV(zp); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); MPASS(zp->z_cached_symlink == NULL); symlink_len = strlen(ap->a_target); symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK); if (symlink != NULL) { memcpy(symlink, ap->a_target, symlink_len); symlink[symlink_len] = '\0'; atomic_store_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)symlink); } } return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; #endif static int zfs_freebsd_readlink(struct vop_readlink_args *ap) { zfs_uio_t uio; int error; znode_t *zp = VTOZ(ap->a_vp); char *symlink, *base; size_t symlink_len; bool trycache; zfs_uio_init(&uio, ap->a_uio); trycache = false; if (zfs_uio_segflg(&uio) == UIO_SYSSPACE && zfs_uio_iovcnt(&uio) == 1) { base = zfs_uio_iovbase(&uio, 0); symlink_len = zfs_uio_iovlen(&uio, 0); trycache = true; } error = zfs_readlink(ap->a_vp, &uio, ap->a_cred, NULL); if (atomic_load_ptr(&zp->z_cached_symlink) != NULL || error != 0 || !trycache) { return (error); } symlink_len -= zfs_uio_resid(&uio); symlink = cache_symlink_alloc(symlink_len + 1, M_WAITOK); if (symlink != NULL) { memcpy(symlink, base, symlink_len); symlink[symlink_len] = '\0'; if (!atomic_cmpset_rel_ptr((uintptr_t *)&zp->z_cached_symlink, (uintptr_t)NULL, (uintptr_t)symlink)) { cache_symlink_free(symlink, symlink_len + 1); } } return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_link(struct vop_link_args *ap) { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); #if __FreeBSD_version < 1400068 ASSERT(cnp->cn_flags & SAVENAME); #endif return (zfs_link(VTOZ(tdvp), VTOZ(vp), cnp->cn_nameptr, cnp->cn_cred, 0)); } #ifndef _SYS_SYSPROTO_H_ struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_inactive(struct vop_inactive_args *ap) { vnode_t *vp = ap->a_vp; zfs_inactive(vp, curthread->td_ucred, NULL); return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_need_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int need; if (vn_need_pageq_flush(vp)) return (1); if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs)) return (1); need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); return (need); } #ifndef _SYS_SYSPROTO_H_ struct vop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_reclaim(struct vop_reclaim_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT3P(zp, !=, NULL); /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. 
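 * If the SA handle is already gone the znode was torn down by a forced
 * unmount, so just free it below; otherwise go through zfs_zinactive().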
*/ ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vp->v_data = NULL; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_fid_args { struct vnode *a_vp; struct fid *a_fid; }; #endif static int zfs_freebsd_fid(struct vop_fid_args *ap) { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct vop_pathconf_args { struct vnode *a_vp; int a_name; register_t *a_retval; } *ap; #endif static int zfs_freebsd_pathconf(struct vop_pathconf_args *ap) { ulong_t val; int error; +#ifdef _PC_CLONE_BLKSIZE + zfsvfs_t *zfsvfs; +#endif error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) { *ap->a_retval = val; return (error); } if (error != EOPNOTSUPP) return (error); switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); #if __FreeBSD_version >= 1400032 case _PC_DEALLOC_PRESENT: *ap->a_retval = 1; return (0); #endif case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; return (0); } return (EINVAL); #if __FreeBSD_version >= 1500040 case _PC_NAMEDATTR_ENABLED: MNT_ILOCK(ap->a_vp->v_mount); if ((ap->a_vp->v_mount->mnt_flag & MNT_NAMEDATTR) != 0) *ap->a_retval = 1; else *ap->a_retval = 0; MNT_IUNLOCK(ap->a_vp->v_mount); return (0); case _PC_HAS_NAMEDATTR: if (zfs_has_namedattr(ap->a_vp, curthread->td_ucred)) *ap->a_retval = 1; else *ap->a_retval = 0; return (0); #endif #ifdef _PC_HAS_HIDDENSYSTEM case _PC_HAS_HIDDENSYSTEM: *ap->a_retval = 1; return (0); +#endif +#ifdef _PC_CLONE_BLKSIZE + case _PC_CLONE_BLKSIZE: + zfsvfs = (zfsvfs_t *)ap->a_vp->v_mount->mnt_data; + if (zfs_bclone_enabled && + spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os), + SPA_FEATURE_BLOCK_CLONING)) + *ap->a_retval = dsl_dataset_feature_is_active( + zfsvfs->z_os->os_dsl_dataset, + SPA_FEATURE_LARGE_BLOCKS) ? + SPA_MAXBLOCKSIZE : + SPA_OLD_MAXBLOCKSIZE; + else + *ap->a_retval = 0; + return (0); #endif default: return (vop_stdpathconf(ap)); } } int zfs_xattr_compat = 1; static int zfs_check_attrname(const char *name) { /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (SET_ERROR(EINVAL)); /* We don't allow attribute names that start with a namespace prefix. */ if (ZFS_XA_NS_PREFIX_FORBIDDEN(name)) return (SET_ERROR(EINVAL)); return (0); } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE XATTR_COMPAT PREFIX * system * freebsd:system: * user 1 (none, can be used to access ZFS * fsattr(5) attributes created on Solaris) * user 0 user. */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size, boolean_t compat) { const char *namespace, *prefix, *suffix; memset(attrname, 0, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: if (compat) { /* * This is the default namespace by which we can access * all attributes created on Solaris. */ prefix = namespace = suffix = ""; } else { /* * This is compatible with the user namespace encoding * on Linux prior to xattr_compat, but nothing * else. 
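 * For example, a user-namespace attribute named "foo" is stored under
 * the on-disk name "user.foo" in this mode.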
*/ prefix = ""; namespace = "user"; suffix = "."; } break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (SET_ERROR(EINVAL)); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (SET_ERROR(ENAMETOOLONG)); } return (0); } static int zfs_ensure_xattr_cached(znode_t *zp) { int error = 0; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); if (zp->z_xattr_cached != NULL) return (0); if (rw_write_held(&zp->z_xattr_lock)) return (zfs_sa_get_xattr(zp)); if (!rw_tryupgrade(&zp->z_xattr_lock)) { rw_exit(&zp->z_xattr_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); } if (zp->z_xattr_cached == NULL) error = zfs_sa_get_xattr(zp); rw_downgrade(&zp->z_xattr_lock); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_getextattr_dir(struct vop_getextattr_args *ap, const char *attrname) { struct thread *td = ap->a_td; struct nameidata nd; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) return (error); flags = FREAD; #if __FreeBSD_version < 1400043 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); #else NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); #endif error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); if (error != 0) return (SET_ERROR(error)); vp = nd.ni_vp; NDFREE_PNBUF(&nd); if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp); vn_close(vp, flags, ap->a_cred, td); return (error); } static int zfs_getextattr_sa(struct vop_getextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); uchar_t *nv_value; uint_t nv_size; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); error = nvlist_lookup_byte_array(zp->z_xattr_cached, attrname, &nv_value, &nv_size); if (error != 0) return (SET_ERROR(error)); if (ap->a_size != NULL) *ap->a_size = nv_size; else if (ap->a_uio != NULL) error = uiomove(nv_value, nv_size, ap->a_uio); if (error != 0) return (SET_ERROR(error)); return (0); } static int zfs_getextattr_impl(struct vop_getextattr_args *ap, boolean_t compat) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname), compat); if (error != 0) return (error); error = ENOENT; if (zfsvfs->z_use_sa && zp->z_is_sa) error = zfs_getextattr_sa(ap, attrname); if (error == ENOENT) error = zfs_getextattr_dir(ap, attrname); return (error); } /* * Vnode operation to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; /* * If the xattr property is off, refuse the request. 
*/ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (SET_ERROR(error)); error = zfs_check_attrname(ap->a_name); if (error != 0) return (error); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); error = ENOENT; rw_enter(&zp->z_xattr_lock, RW_READER); error = zfs_getextattr_impl(ap, zfs_xattr_compat); if ((error == ENOENT || error == ENOATTR) && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) { /* * Fall back to the alternate namespace format if we failed to * find a user xattr. */ error = zfs_getextattr_impl(ap, !zfs_xattr_compat); } rw_exit(&zp->z_xattr_lock); zfs_exit(zfsvfs, FTAG); if (error == ENOENT) error = SET_ERROR(ENOATTR); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_deleteextattr_dir(struct vop_deleteextattr_args *ap, const char *attrname) { struct nameidata nd; vnode_t *xvp = NULL, *vp; int error; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) return (error); #if __FreeBSD_version < 1400043 NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, ap->a_td); #else NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp); #endif error = namei(&nd); if (error != 0) return (SET_ERROR(error)); vp = nd.ni_vp; error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE_PNBUF(&nd); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); return (error); } static int zfs_deleteextattr_sa(struct vop_deleteextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); nvlist_t *nvl; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); nvl = zp->z_xattr_cached; error = nvlist_remove(nvl, attrname, DATA_TYPE_BYTE_ARRAY); if (error != 0) error = SET_ERROR(error); else error = zfs_sa_set_xattr(zp, attrname, NULL, 0); if (error != 0) { zp->z_xattr_cached = NULL; nvlist_free(nvl); } return (error); } static int zfs_deleteextattr_impl(struct vop_deleteextattr_args *ap, boolean_t compat) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname), compat); if (error != 0) return (error); error = ENOENT; if (zfsvfs->z_use_sa && zp->z_is_sa) error = zfs_deleteextattr_sa(ap, attrname); if (error == ENOENT) error = zfs_deleteextattr_dir(ap, attrname); return (error); } /* * Vnode operation to remove a named attribute. */ static int zfs_deleteextattr(struct vop_deleteextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; /* * If the xattr property is off, refuse the request. 
*/ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (SET_ERROR(error)); error = zfs_check_attrname(ap->a_name); if (error != 0) return (error); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); rw_enter(&zp->z_xattr_lock, RW_WRITER); error = zfs_deleteextattr_impl(ap, zfs_xattr_compat); if ((error == ENOENT || error == ENOATTR) && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) { /* * Fall back to the alternate namespace format if we failed to * find a user xattr. */ error = zfs_deleteextattr_impl(ap, !zfs_xattr_compat); } rw_exit(&zp->z_xattr_lock); zfs_exit(zfsvfs, FTAG); if (error == ENOENT) error = SET_ERROR(ENOATTR); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_setextattr_dir(struct vop_setextattr_args *ap, const char *attrname) { struct thread *td = ap->a_td; struct nameidata nd; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); if (error != 0) return (error); flags = FFLAGS(O_WRONLY | O_CREAT); #if __FreeBSD_version < 1400043 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); #else NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp); #endif error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); if (error != 0) return (SET_ERROR(error)); vp = nd.ni_vp; NDFREE_PNBUF(&nd); VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK(vp); vn_close(vp, flags, ap->a_cred, td); return (error); } static int zfs_setextattr_sa(struct vop_setextattr_args *ap, const char *attrname) { znode_t *zp = VTOZ(ap->a_vp); nvlist_t *nvl; size_t sa_size; int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); nvl = zp->z_xattr_cached; size_t entry_size = ap->a_uio->uio_resid; if (entry_size > DXATTR_MAX_ENTRY_SIZE) return (SET_ERROR(EFBIG)); error = nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error != 0) return (SET_ERROR(error)); if (sa_size > DXATTR_MAX_SA_SIZE) return (SET_ERROR(EFBIG)); uchar_t *buf = kmem_alloc(entry_size, KM_SLEEP); error = uiomove(buf, entry_size, ap->a_uio); if (error != 0) { error = SET_ERROR(error); } else { error = nvlist_add_byte_array(nvl, attrname, buf, entry_size); if (error != 0) error = SET_ERROR(error); } if (error == 0) error = zfs_sa_set_xattr(zp, attrname, buf, entry_size); kmem_free(buf, entry_size); if (error != 0) { zp->z_xattr_cached = NULL; nvlist_free(nvl); } return (error); } static int zfs_setextattr_impl(struct vop_setextattr_args *ap, boolean_t compat) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrname[EXTATTR_MAXNAMELEN+1]; int error; error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname), compat); if (error != 0) return (error); struct vop_deleteextattr_args vda = { .a_vp = ap->a_vp, .a_attrnamespace = ap->a_attrnamespace, .a_name = ap->a_name, .a_cred = ap->a_cred, .a_td = ap->a_td, }; error = ENOENT; if (zfsvfs->z_use_sa && zp->z_is_sa && zfsvfs->z_xattr_sa) { error = zfs_setextattr_sa(ap, attrname); 
if (error == 0) { /* * Successfully put into SA, we need to clear the one * in dir if present. */ zfs_deleteextattr_dir(&vda, attrname); } } if (error != 0) { error = zfs_setextattr_dir(ap, attrname); if (error == 0 && zp->z_is_sa) { /* * Successfully put into dir, we need to clear the one * in SA if present. */ zfs_deleteextattr_sa(&vda, attrname); } } if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) { /* * Also clear all versions of the alternate compat name. */ zfs_deleteextattr_impl(&vda, !compat); } return (error); } /* * Vnode operation to set a named attribute. */ static int zfs_setextattr(struct vop_setextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (SET_ERROR(error)); error = zfs_check_attrname(ap->a_name); if (error != 0) return (error); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); rw_enter(&zp->z_xattr_lock, RW_WRITER); error = zfs_setextattr_impl(ap, zfs_xattr_compat); rw_exit(&zp->z_xattr_lock); zfs_exit(zfsvfs, FTAG); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif static int zfs_listextattr_dir(struct vop_listextattr_args *ap, const char *attrprefix) { struct thread *td = ap->a_td; struct nameidata nd; uint8_t dirbuf[sizeof (struct dirent)]; struct iovec aiov; struct uio auio; vnode_t *xvp = NULL, *vp; int error, eof; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, LOOKUP_XATTR, B_FALSE); if (error != 0) { /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } #if __FreeBSD_version < 1400043 NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); #else NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp); #endif error = namei(&nd); if (error != 0) return (SET_ERROR(error)); vp = nd.ni_vp; NDFREE_PNBUF(&nd); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; size_t plen = strlen(attrprefix); do { aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof (dirbuf); auio.uio_resid = sizeof (dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); if (error != 0) break; int done = sizeof (dirbuf) - auio.uio_resid; for (int pos = 0; pos < done; ) { struct dirent *dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. */ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; else if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(dp->d_name)) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; uint8_t nlen = dp->d_namlen - plen; if (ap->a_size != NULL) { *ap->a_size += 1 + nlen; } else if (ap->a_uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. 
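 * For example, an attribute whose prefix-stripped name is "foo" is
 * emitted as the four bytes { 3, 'f', 'o', 'o' }.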
*/ error = uiomove(&nlen, 1, ap->a_uio); if (error == 0) { char *namep = dp->d_name + plen; error = uiomove(namep, nlen, ap->a_uio); } if (error != 0) { error = SET_ERROR(error); break; } } } } while (!eof && error == 0); vput(vp); return (error); } static int zfs_listextattr_sa(struct vop_listextattr_args *ap, const char *attrprefix) { znode_t *zp = VTOZ(ap->a_vp); int error; error = zfs_ensure_xattr_cached(zp); if (error != 0) return (error); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); ASSERT3P(zp->z_xattr_cached, !=, NULL); size_t plen = strlen(attrprefix); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); const char *name = nvpair_name(nvp); if (plen == 0 && ZFS_XA_NS_PREFIX_FORBIDDEN(name)) continue; else if (strncmp(name, attrprefix, plen) != 0) continue; uint8_t nlen = strlen(name) - plen; if (ap->a_size != NULL) { *ap->a_size += 1 + nlen; } else if (ap->a_uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, ap->a_uio); if (error == 0) { char *namep = __DECONST(char *, name) + plen; error = uiomove(namep, nlen, ap->a_uio); } if (error != 0) { error = SET_ERROR(error); break; } } } return (error); } static int zfs_listextattr_impl(struct vop_listextattr_args *ap, boolean_t compat) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); char attrprefix[16]; int error; error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof (attrprefix), compat); if (error != 0) return (error); if (zfsvfs->z_use_sa && zp->z_is_sa) error = zfs_listextattr_sa(ap, attrprefix); if (error == 0) error = zfs_listextattr_dir(ap, attrprefix); return (error); } /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; if (ap->a_size != NULL) *ap->a_size = 0; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) return (SET_ERROR(EOPNOTSUPP)); error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (SET_ERROR(error)); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); rw_enter(&zp->z_xattr_lock, RW_READER); error = zfs_listextattr_impl(ap, zfs_xattr_compat); if (error == 0 && ap->a_attrnamespace == EXTATTR_NAMESPACE_USER) { /* Also list user xattrs with the alternate format. 
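 * (This way attributes written under either the bare Solaris-style
 * encoding or the "user."-prefixed encoding appear in a single listing.)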
*/ error = zfs_listextattr_impl(ap, !zfs_xattr_compat); } rw_exit(&zp->z_xattr_lock); zfs_exit(zfsvfs, FTAG); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_getacl_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_getacl(struct vop_getacl_args *ap) { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if ((error = zfs_getsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred))) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_setacl_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_setacl(struct vop_setacl_args *ap) { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred); kmem_free(aaclp, aclbsize); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_aclcheck_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap) { return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct vop_advise_args { struct vnode *a_vp; off_t a_start; off_t a_end; int a_advice; }; #endif static int zfs_freebsd_advise(struct vop_advise_args *ap) { vnode_t *vp = ap->a_vp; off_t start = ap->a_start; off_t end = ap->a_end; int advice = ap->a_advice; off_t len; znode_t *zp; zfsvfs_t *zfsvfs; objset_t *os; int error = 0; if (end < start) return (EINVAL); error = vn_lock(vp, LK_SHARED); if (error) return (error); zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; os = zp->z_zfsvfs->z_os; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) goto out_unlock; /* kern_posix_fadvise points to the last byte, we want one past */ if (end != OFF_MAX) end += 1; len = end - start; switch (advice) { case POSIX_FADV_WILLNEED: /* * Pass on the caller's size directly, but note that * dmu_prefetch_max will effectively cap it. If there really * is a larger sequential access pattern, perhaps dmu_zfetch * will detect it. 
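 * (A userland posix_fadvise(fd, off, len, POSIX_FADV_WILLNEED) call
 * typically reaches this switch via kern_posix_fadvise() and VOP_ADVISE().)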
*/ dmu_prefetch(os, zp->z_id, 0, start, len, ZIO_PRIORITY_ASYNC_READ); break; case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_DONTNEED: case POSIX_FADV_NOREUSE: /* ignored for now */ break; default: error = EINVAL; break; } zfs_exit(zfsvfs, FTAG); out_unlock: VOP_UNLOCK(vp); return (error); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); int ltype; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. */ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name, sizeof (name)); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; memcpy(ap->a_buf + *ap->a_buflen, name, len); *ap->a_vpp = ZTOV(dzp); } zfs_exit(zfsvfs, FTAG); return (error); } zfs_exit(zfsvfs, FTAG); covered_vp = vp->v_mount->mnt_vnodecovered; enum vgetstate vs = vget_prep(covered_vp); ltype = VOP_ISLOCKED(vp); VOP_UNLOCK(vp); error = vget_finish(covered_vp, LK_SHARED, vs); if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if (VN_IS_DOOMED(vp)) error = SET_ERROR(ENOENT); return (error); } #if __FreeBSD_version >= 1400032 static int zfs_deallocate(struct vop_deallocate_args *ap) { znode_t *zp = VTOZ(ap->a_vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; off_t off, len, file_sz; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } zilog = zfsvfs->z_log; off = *ap->a_offset; len = *ap->a_len; file_sz = zp->z_size; if (off + len > file_sz) len = file_sz - off; /* Fast path for out-of-range request. */ if (len <= 0) { *ap->a_len = 0; zfs_exit(zfsvfs, FTAG); return (0); } error = zfs_freesp(zp, off, len, O_RDWR, TRUE); if (error == 0) { if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS || (ap->a_ioflag & IO_SYNC) != 0) error = zil_commit(zilog, zp->z_id); if (error == 0) { *ap->a_offset = off + len; *ap->a_len = 0; } } zfs_exit(zfsvfs, FTAG); return (error); } #endif #ifndef _SYS_SYSPROTO_H_ struct vop_copy_file_range_args { struct vnode *a_invp; off_t *a_inoffp; struct vnode *a_outvp; off_t *a_outoffp; size_t *a_lenp; unsigned int a_flags; struct ucred *a_incred; struct ucred *a_outcred; struct thread *a_fsizetd; }; #endif /* * TODO: FreeBSD will only call file system-specific copy_file_range() if both * files reside under the same mountpoint. In case of ZFS we want to be called * even if files are in different datasets (but on the same pool; we need * to check that ourselves). */ static int zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) { zfsvfs_t *outzfsvfs; struct vnode *invp = ap->a_invp; struct vnode *outvp = ap->a_outvp; struct mount *mp; int error; uint64_t len = *ap->a_lenp; if (!zfs_bclone_enabled) { mp = NULL; goto bad_write_fallback; } /* * TODO: If offset/length is not aligned to recordsize, use * vn_generic_copy_file_range() on this fragment.
* It would be better to do this after we lock the vnodes, but then we * need something else than vn_generic_copy_file_range(). */ vn_start_write(outvp, &mp, V_WAIT); if (__predict_true(mp == outvp->v_mount)) { outzfsvfs = (zfsvfs_t *)mp->mnt_data; if (!spa_feature_is_enabled(dmu_objset_spa(outzfsvfs->z_os), SPA_FEATURE_BLOCK_CLONING)) { goto bad_write_fallback; } } if (invp == outvp) { if (vn_lock(outvp, LK_EXCLUSIVE) != 0) { goto bad_write_fallback; } } else { #if (__FreeBSD_version >= 1302506 && __FreeBSD_version < 1400000) || \ __FreeBSD_version >= 1400086 vn_lock_pair(invp, false, LK_SHARED, outvp, false, LK_EXCLUSIVE); #else vn_lock_pair(invp, false, outvp, false); #endif if (VN_IS_DOOMED(invp) || VN_IS_DOOMED(outvp)) { goto bad_locked_fallback; } } #ifdef MAC error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred, outvp); if (error != 0) goto out_locked; #endif error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), ap->a_outoffp, &len, ap->a_outcred); if (error == EXDEV || error == EAGAIN || error == EINVAL || error == EOPNOTSUPP) goto bad_locked_fallback; *ap->a_lenp = (size_t)len; #ifdef MAC out_locked: #endif if (invp != outvp) VOP_UNLOCK(invp); VOP_UNLOCK(outvp); if (mp != NULL) vn_finished_write(mp); return (error); bad_locked_fallback: if (invp != outvp) VOP_UNLOCK(invp); VOP_UNLOCK(outvp); bad_write_fallback: if (mp != NULL) vn_finished_write(mp); error = ENOSYS; return (error); } struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, .vop_need_inactive = zfs_freebsd_need_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, .vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, .vop_access = zfs_freebsd_access, .vop_allocate = VOP_EOPNOTSUPP, #if __FreeBSD_version >= 1400032 .vop_deallocate = zfs_deallocate, #endif .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_cachedlookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = (vop_mknod_t *)zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_advise = zfs_freebsd_advise, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, .vop_lock1 = vop_lock, .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, #if __FreeBSD_version >= 1400043 .vop_add_writecount = vop_stdadd_writecount_nomsync, #endif .vop_copy_file_range = zfs_freebsd_copy_file_range, }; VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, 
.vop_fplookup_symlink = zfs_freebsd_fplookup_symlink, .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, #if __FreeBSD_version >= 1400043 .vop_add_writecount = vop_stdadd_writecount_nomsync, #endif }; VFS_VOP_VECTOR_REGISTER(zfs_fifoops); /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, .vop_fplookup_vexec = VOP_EAGAIN, .vop_fplookup_symlink = VOP_EAGAIN, .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, #if __FreeBSD_version >= 1400043 .vop_add_writecount = vop_stdadd_writecount_nomsync, #endif }; VFS_VOP_VECTOR_REGISTER(zfs_shareops); ZFS_MODULE_PARAM(zfs, zfs_, xattr_compat, INT, ZMOD_RW, "Use legacy ZFS xattr naming for writing new user namespace xattrs"); diff --git a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c index 48dae79a2373..81ac26cb0c93 100644 --- a/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c +++ b/sys/contrib/openzfs/module/os/linux/zfs/zpl_ctldir.c @@ -1,626 +1,667 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * LLNL-CODE-403049. * Rewritten for Linux by: * Rohan Puri * Brian Behlendorf */ #include #include #include #include #include #include #include #include /* * Common open routine. Disallow any write access. */ static int zpl_common_open(struct inode *ip, struct file *filp) { if (blk_mode_is_open_write(filp->f_mode)) return (-EACCES); return (generic_file_open(ip, filp)); } /* * Get root directory contents. 
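 *
 * The .zfs root is purely virtual: besides "." and ".." it only ever
 * contains the two fixed entries emitted below, ZFS_SNAPDIR_NAME and
 * ZFS_SHAREDIR_NAME (normally "snapshot" and "shares"), at directory
 * positions 2 and 3.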
*/ static int zpl_root_iterate(struct file *filp, struct dir_context *ctx) { zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED) { return (SET_ERROR(ENOENT)); } if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); if (!dir_emit_dots(filp, ctx)) goto out; if (ctx->pos == 2) { if (!dir_emit(ctx, ZFS_SNAPDIR_NAME, strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR)) goto out; ctx->pos++; } if (ctx->pos == 3) { if (!dir_emit(ctx, ZFS_SHAREDIR_NAME, strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR)) goto out; ctx->pos++; } out: zpl_exit(zfsvfs, FTAG); return (error); } /* * Get root directory attributes. */ static int #ifdef HAVE_IDMAP_IOPS_GETATTR zpl_root_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_root_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_root_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif #else generic_fillattr(ip, stat); #endif stat->atime = current_time(ip); return (0); } ZPL_GETATTR_WRAPPER(zpl_root_getattr); static struct dentry * zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { cred_t *cr = CRED(); struct inode *ip; int error; crhold(cr); error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); crfree(cr); if (error) { if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } return (d_splice_alias(ip, dentry)); } /* * The '.zfs' control directory file and inode operations. */ const struct file_operations zpl_fops_root = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = zpl_root_iterate, }; const struct inode_operations zpl_ops_root = { .lookup = zpl_root_lookup, .getattr = zpl_root_getattr, }; static struct vfsmount * zpl_snapdir_automount(struct path *path) { int error; error = -zfsctl_snapshot_mount(path, 0); if (error) return (ERR_PTR(error)); /* * Rather than returning the new vfsmount for the snapshot we must * return NULL to indicate a mount collision. This is done because * the user space mount calls do_add_mount() which adds the vfsmount * to the name space. If we returned the new mount here it would be * added again to the vfsmount list resulting in list corruption. */ return (NULL); } /* * Negative dentries must always be revalidated so newly created snapshots * can be detected and automounted. Normal dentries should be kept because * as of the 3.18 kernel revaliding the mountpoint dentry will result in * the snapshot being immediately unmounted. 
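 *
 * In practice the callback below simply reports whether the dentry is
 * positive: returning 0 for a negative dentry forces a fresh ->lookup()
 * so that a snapshot created after the negative entry was cached
 * becomes visible, while returning 1 for a positive dentry leaves an
 * already automounted snapshot alone.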
*/ #ifdef HAVE_D_REVALIDATE_4ARGS static int zpl_snapdir_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) #else static int zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) #endif { return (!!dentry->d_inode); } -static dentry_operations_t zpl_dops_snapdirs = { +static const struct dentry_operations zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() * callback was used as a hack to trigger the mount. The * resulting vfsmount was then explicitly grafted in to the * name space. While it might be possible to add compatibility * code to accomplish this it would require considerable care. */ .d_automount = zpl_snapdir_automount, .d_revalidate = zpl_snapdir_revalidate, }; +/* + * For the .zfs control directory to work properly we must be able to override + * the default operations table and register custom .d_automount and + * .d_revalidate callbacks. + */ +static void +set_snapdir_dentry_ops(struct dentry *dentry, unsigned int extraflags) { + static const unsigned int op_flags = + DCACHE_OP_HASH | DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE | + DCACHE_OP_PRUNE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_REAL; + +#ifdef HAVE_D_SET_D_OP + /* + * d_set_d_op() will set the DCACHE_OP_ flags according to what it + * finds in the passed dentry_operations, so we don't have to. + * + * We clear the flags and the old op table before calling d_set_d_op() + * because issues a warning when the dentry operations table is already + * set. + */ + dentry->d_op = NULL; + dentry->d_flags &= ~op_flags; + d_set_d_op(dentry, &zpl_dops_snapdirs); + dentry->d_flags |= extraflags; +#else + /* + * Since 6.17 there's no exported way to modify dentry ops, so we have + * to reach in and do it ourselves. This should be safe for our very + * narrow use case, which is to create or splice in an entry to give + * access to a snapshot. + * + * We need to set the op flags directly. We hardcode + * DCACHE_OP_REVALIDATE because that's the only operation we have; if + * we ever extend zpl_dops_snapdirs we will need to update the op flags + * to match. + */ + spin_lock(&dentry->d_lock); + dentry->d_op = &zpl_dops_snapdirs; + dentry->d_flags &= ~op_flags; + dentry->d_flags |= DCACHE_OP_REVALIDATE | extraflags; + spin_unlock(&dentry->d_lock); +#endif +} + static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct inode *ip = NULL; int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); spl_fstrans_unmark(cookie); crfree(cr); if (error && error != -ENOENT) return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); - dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; - + set_snapdir_dentry_ops(dentry, DCACHE_NEED_AUTOMOUNT); return (d_splice_alias(ip, dentry)); } static int zpl_snapdir_iterate(struct file *filp, struct dir_context *ctx) { zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); fstrans_cookie_t cookie; char snapname[MAXNAMELEN]; boolean_t case_conflict; uint64_t id, pos; int error = 0; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); cookie = spl_fstrans_mark(); if (!dir_emit_dots(filp, ctx)) goto out; /* Start the position at 0 if it already emitted . and .. */ pos = (ctx->pos == 2 ? 
0 : ctx->pos); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, &pos, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error) goto out; if (!dir_emit(ctx, snapname, strlen(snapname), ZFSCTL_INO_SHARES - id, DT_DIR)) goto out; ctx->pos = pos; } out: spl_fstrans_unmark(cookie); zpl_exit(zfsvfs, FTAG); if (error == -ENOENT) return (0); return (error); } static int #ifdef HAVE_IOPS_RENAME_USERNS zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #elif defined(HAVE_IOPS_RENAME_IDMAP) zpl_snapdir_rename2(struct mnt_idmap *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #else zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) #endif { cred_t *cr = CRED(); int error; /* We probably don't want to support renameat2(2) in ctldir */ if (flags) return (-EINVAL); crhold(cr); error = -zfsctl_snapdir_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); ASSERT3S(error, <=, 0); crfree(cr); return (error); } #if (!defined(HAVE_RENAME_WANTS_FLAGS) && \ !defined(HAVE_IOPS_RENAME_USERNS) && \ !defined(HAVE_IOPS_RENAME_IDMAP)) static int zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) { return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0)); } #endif static int zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) { cred_t *cr = CRED(); int error; crhold(cr); error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0); ASSERT3S(error, <=, 0); crfree(cr); return (error); } #if defined(HAVE_IOPS_MKDIR_USERNS) static int zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #elif defined(HAVE_IOPS_MKDIR_IDMAP) static int zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #elif defined(HAVE_IOPS_MKDIR_DENTRY) static struct dentry * zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) #else static int zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) #endif { cred_t *cr = CRED(); vattr_t *vap; struct inode *ip; int error; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); #if (defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) zpl_vap_init(vap, dip, mode | S_IFDIR, cr, user_ns); #else zpl_vap_init(vap, dip, mode | S_IFDIR, cr, zfs_init_idmap); #endif error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { - d_clear_d_op(dentry); - d_set_d_op(dentry, &zpl_dops_snapdirs); + set_snapdir_dentry_ops(dentry, 0); d_instantiate(dentry, ip); } kmem_free(vap, sizeof (vattr_t)); ASSERT3S(error, <=, 0); crfree(cr); #if defined(HAVE_IOPS_MKDIR_DENTRY) return (ERR_PTR(error)); #else return (error); #endif } /* * Get snapshot directory attributes. 
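 *
 * The link count reported here is derived from the snapshot count:
 * nlink and size both start at 2 (for "." and ".."), and nlink is then
 * increased by the number of entries in the dataset's snapshot-names
 * ZAP.  A dataset with, say, five snapshots therefore reports
 * st_nlink == 7 while st_size stays 2.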
*/ static int #ifdef HAVE_IDMAP_IOPS_GETATTR zpl_snapdir_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_snapdir_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, ip, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif #else generic_fillattr(ip, stat); #endif stat->nlink = stat->size = 2; dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { uint64_t snap_count; int err = zap_count( dmu_objset_pool(ds->ds_objset)->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (err != 0) { zpl_exit(zfsvfs, FTAG); return (-err); } stat->nlink += snap_count; } stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); zpl_exit(zfsvfs, FTAG); return (0); } ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr); /* * The '.zfs/snapshot' directory file operations. These mainly control * generating the list of available snapshots when doing an 'ls' in the * directory. See zpl_snapdir_readdir(). */ const struct file_operations zpl_fops_snapdir = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = zpl_snapdir_iterate, }; /* * The '.zfs/snapshot' directory inode operations. These mainly control * creating an inode for a snapshot directory and initializing the needed * infrastructure to automount the snapshot. See zpl_snapdir_lookup(). 
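 *
 * Directory-modifying operations on '.zfs/snapshot' double as snapshot
 * administration (subject to the usual permission and delegation
 * checks): mkdir(2) creates a snapshot, rmdir(2) destroys one and
 * rename(2) renames one, so `mkdir .zfs/snapshot/backup` inside a
 * mounted dataset is roughly equivalent to `zfs snapshot <dataset>@backup`.
 * renameat2(2) flags are rejected with EINVAL.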
*/ const struct inode_operations zpl_ops_snapdir = { .lookup = zpl_snapdir_lookup, .getattr = zpl_snapdir_getattr, #if (defined(HAVE_RENAME_WANTS_FLAGS) || \ defined(HAVE_IOPS_RENAME_USERNS) || \ defined(HAVE_IOPS_RENAME_IDMAP)) .rename = zpl_snapdir_rename2, #else .rename = zpl_snapdir_rename, #endif .rmdir = zpl_snapdir_rmdir, .mkdir = zpl_snapdir_mkdir, }; static struct dentry * zpl_shares_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct inode *ip = NULL; int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); spl_fstrans_unmark(cookie); crfree(cr); if (error) { if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } return (d_splice_alias(ip, dentry)); } static int zpl_shares_iterate(struct file *filp, struct dir_context *ctx) { fstrans_cookie_t cookie; cred_t *cr = CRED(); zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); znode_t *dzp; int error = 0; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { dir_emit_dots(filp, ctx); goto out; } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error) goto out; crhold(cr); error = -zfs_readdir(ZTOI(dzp), ctx, cr); crfree(cr); iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); } static int #ifdef HAVE_USERNS_IOPS_GETATTR zpl_shares_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #elif defined(HAVE_IDMAP_IOPS_GETATTR) zpl_shares_getattr_impl(struct mnt_idmap *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #else zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) #endif { (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *dzp; int error; if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); if (zfsvfs->z_shares_dir == 0) { #if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, path->dentry->d_inode, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP) generic_fillattr(user_ns, path->dentry->d_inode, stat); #elif defined(HAVE_GENERIC_FILLATTR_IDMAP_REQMASK) generic_fillattr(user_ns, request_mask, ip, stat); #else (void) user_ns; #endif #else generic_fillattr(path->dentry->d_inode, stat); #endif stat->nlink = stat->size = 2; stat->atime = current_time(ip); zpl_exit(zfsvfs, FTAG); return (0); } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK error = -zfs_getattr_fast(user_ns, request_mask, ZTOI(dzp), stat); #elif (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); #else error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); #endif iput(ZTOI(dzp)); } zpl_exit(zfsvfs, FTAG); ASSERT3S(error, <=, 0); return (error); } ZPL_GETATTR_WRAPPER(zpl_shares_getattr); /* * The '.zfs/shares' directory file operations. 
*/ const struct file_operations zpl_fops_shares = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = zpl_shares_iterate, }; /* * The '.zfs/shares' directory inode operations. */ const struct inode_operations zpl_ops_shares = { .lookup = zpl_shares_lookup, .getattr = zpl_shares_getattr, }; diff --git a/sys/contrib/openzfs/module/zfs/arc.c b/sys/contrib/openzfs/module/zfs/arc.c index df41e3b49204..bd6dc8edd8ca 100644 --- a/sys/contrib/openzfs/module/zfs/arc.c +++ b/sys/contrib/openzfs/module/zfs/arc.c @@ -1,11296 +1,11296 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2020, Delphix. All rights reserved. * Copyright (c) 2014, Saso Kiselkov. All rights reserved. * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. * Copyright (c) 2020, George Amanakis. All rights reserved. * Copyright (c) 2019, 2024, 2025, Klara, Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020, The FreeBSD Foundation [1] * Copyright (c) 2021, 2024 by George Melikov. All rights reserved. * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ /* * DVA-based Adjustable Replacement Cache * * While much of the theory of operation used here is * based on the self-tuning, low overhead replacement cache * presented by Megiddo and Modha at FAST 2003, there are some * significant differences: * * 1. The Megiddo and Modha model assumes any page is evictable. * Pages in its cache cannot be "locked" into memory. This makes * the eviction algorithm simple: evict the last page in the list. * This also make the performance characteristics easy to reason * about. Our cache is not so simple. At any given moment, some * subset of the blocks in the cache are un-evictable because we * have handed out a reference to them. Blocks are only evictable * when there are no external references active. This makes * eviction far more problematic: we choose to evict the evictable * blocks that are the "lowest" in the list. * * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we * implement a "cache throttle" that slows the flow of new data * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. 
It grows with * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, it's simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (ranging from 512 bytes to * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" * by N. Megiddo & D. Modha, FAST 2003 */ /* * The locking model: * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal ARC algorithms for * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * ARC list locks. * * Buffers do not have their own mutexes, rather they rely on the * hash table mutexes for the bulk of their protection (i.e. most * fields in the arc_buf_hdr_t are protected by these mutexes). * * buf_hash_find() returns the appropriate mutex (held) when it * locates the requested buffer in the hash table. It returns * NULL for the mutex if the buffer was not in the table. * * buf_hash_remove() expects the appropriate hash mutex to be * already held before it is invoked. * * Each ARC state also has a mutex which is used to protect the * buffer list associated with the state. When attempting to * obtain a hash table lock while holding an ARC list lock you * must use mutex_tryenter() to avoid deadlock. Also note that * the active state mutex must be held before the ghost state mutex. * * It is also possible to register a callback which is run when the * metadata limit is reached and no buffers can be safely evicted. In * this case the arc user should drop a reference on some arc buffers so * they can be reclaimed. For example, when using the ZPL each dentry * holds a reference on a znode. These dentries must be pruned before * the arc buffer holding the znode can be safely evicted. * * Note that the majority of the performance stats are manipulated * with atomic operations. * * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction * - L2ARC write completion, which walks L2ARC buflists * - ARC header destruction, as it removes from L2ARC buflists * - ARC header release, as it removes from L2ARC buflists */ /* * ARC operation: * * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure. * This structure can point either to a block that is still in the cache or to * one that is only accessible in an L2 ARC device, or it can provide * information about a block that was recently evicted. If a block is * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough * information to retrieve it from the L2ARC device. This information is * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block * that is in this state cannot access the data directly. * * Blocks that are actively being referenced or have not been evicted * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within * the arc_buf_hdr_t that will point to the data block in memory.
A block can * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd). * * The L1ARC's data pointer may or may not be uncompressed. The ARC has the * ability to store the physical data (b_pabd) associated with the DVA of the * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block, * it will match its on-disk compression characteristics. This behavior can be * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the * compressed ARC functionality is disabled, the b_pabd will point to an * uncompressed version of the on-disk data. * * Data in the L1ARC is not accessed by consumers of the ARC directly. Each * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC * consumer. The ARC will provide references to this data and will keep it * cached until it is no longer in use. The ARC caches only the L1ARC's physical * data block and will evict any arc_buf_t that is no longer referenced. The * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the * "overhead_size" kstat. * * Depending on the consumer, an arc_buf_t can be requested in uncompressed or * compressed form. The typical case is that consumers will want uncompressed * data, and when that happens a new data buffer is allocated where the data is * decompressed for them to use. Currently the only consumer who wants * compressed arc_buf_t's is "zfs send", when it streams data exactly as it * exists on disk. When this happens, the arc_buf_t's data buffer is shared * with the arc_buf_hdr_t. * * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The * first one is owned by a compressed send consumer (and therefore references * the same compressed data buffer as the arc_buf_hdr_t) and the second could be * used by any other consumer (and has its own uncompressed copy of the data * buffer). * * arc_buf_hdr_t * +-----------+ * | fields | * | common to | * | L1- and | * | L2ARC | * +-----------+ * | l2arc_buf_hdr_t * | | * +-----------+ * | l1arc_buf_hdr_t * | | arc_buf_t * | b_buf +------------>+-----------+ arc_buf_t * | b_pabd +-+ |b_next +---->+-----------+ * +-----------+ | |-----------| |b_next +-->NULL * | |b_comp = T | +-----------+ * | |b_data +-+ |b_comp = F | * | +-----------+ | |b_data +-+ * +->+------+ | +-----------+ | * compressed | | | | * data | |<--------------+ | uncompressed * +------+ compressed, | data * shared +-->+------+ * data | | * | | * +------+ * * When a consumer reads a block, the ARC must first look to see if the * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new * arc_buf_t and either copies uncompressed data into a new data buffer from an * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the * hdr is compressed and the desired compression characteristics of the * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be * the last buffer in the hdr's b_buf list, however a shared compressed buf can * be anywhere in the hdr's list. 
* * The diagram below shows an example of an uncompressed ARC hdr that is * sharing its data with an arc_buf_t (note that the shared uncompressed buf is * the last element in the buf list): * * arc_buf_hdr_t * +-----------+ * | | * | | * | | * +-----------+ * l2arc_buf_hdr_t| | * | | * +-----------+ * l1arc_buf_hdr_t| | * | | arc_buf_t (shared) * | b_buf +------------>+---------+ arc_buf_t * | | |b_next +---->+---------+ * | b_pabd +-+ |---------| |b_next +-->NULL * +-----------+ | | | +---------+ * | |b_data +-+ | | * | +---------+ | |b_data +-+ * +->+------+ | +---------+ | * | | | | * uncompressed | | | | * data +------+ | | * ^ +->+------+ | * | uncompressed | | | * | data | | | * | +------+ | * +---------------------------------+ * * Writing to the ARC requires that the ARC first discard the hdr's b_pabd * since the physical block is about to be rewritten. The new data contents * will be contained in the arc_buf_t. As the I/O pipeline performs the write, * it may compress the data before writing it to disk. The ARC will be called * with the transformed data and will memcpy the transformed on-disk block into * a newly allocated b_pabd. Writes are always done into buffers which have * either been loaned (and hence are new and don't have other readers) or * buffers which have been released (and hence have their own hdr, if there * were originally other readers of the buf's original hdr). This ensures that * the ARC only needs to update a single buf and its hdr after a write occurs. * * When the L2ARC is in use, it will also take advantage of the b_pabd. The * L2ARC will always write the contents of b_pabd to the L2ARC. This means * that when compressed ARC is enabled that the L2ARC blocks are identical * to the on-disk block in the main data pool. This provides a significant * advantage since the ARC can leverage the bp's checksum when reading from the * L2ARC to determine if the contents are valid. However, if the compressed * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. * * The L1ARC has a slightly different system for storing encrypted data. * Raw (encrypted + possibly compressed) data has a few subtle differences from * data that is just compressed. The biggest difference is that it is not * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. * The other difference is that encryption cannot be treated as a suggestion. * If a caller would prefer compressed data, but they actually wind up with * uncompressed data the worst thing that could happen is there might be a * performance hit. If the caller requests encrypted data, however, we must be * sure they actually get it or else secret information could be leaked. Raw * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, * may have both an encrypted version and a decrypted version of its data at * once. When a caller needs a raw arc_buf_t, it is allocated and the data is * copied out of this header. To avoid complications with b_pabd, raw buffers * cannot be shared. 
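 *
 * As a compact summary (illustrative, not exhaustive) of the buffers a
 * header may reference:
 *
 *	b_pabd			data in its on-disk format (compressed
 *				when compressed ARC is enabled)
 *	b_crypt_hdr.b_rabd	raw (encrypted) data; never shared
 *	b_l1hdr.b_buf		list of per-consumer arc_buf_t's, which
 *				may include one buffer that shares b_pabd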
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _KERNEL /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ boolean_t arc_watch = B_FALSE; #endif /* * This thread's job is to keep enough free memory in the system, by * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ static zthr_t *arc_evict_zthr; static arc_buf_hdr_t **arc_state_evict_markers; static int arc_state_evict_marker_count; static kmutex_t arc_evict_lock; static boolean_t arc_evict_needed = B_FALSE; static clock_t arc_last_uncached_flush; static taskq_t *arc_evict_taskq; static struct evict_arg *arc_evict_arg; /* * Count of bytes evicted since boot. */ static uint64_t arc_evict_count; /* * List of arc_evict_waiter_t's, representing threads waiting for the * arc_evict_count to reach specific values. */ static list_t arc_evict_waiters; /* * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of * the requested amount of data to be evicted. For example, by default for * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. * Since this is above 100%, it ensures that progress is made towards getting * arc_size under arc_c. Since this is finite, it ensures that allocations * can still happen, even during the potentially long time that arc_size is * more than arc_c. */ static uint_t zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before * dropping the sublist lock and evicting from another sublist. A lower * value means we're more likely to evict the "correct" header (i.e. the * oldest header in the arc state), but comes with higher overhead * (i.e. more invocations of arc_evict_state_impl()). */ static uint_t zfs_arc_evict_batch_limit = 10; /* number of seconds before growing cache again */ uint_t arc_grow_retry = 5; /* * Minimum time between calls to arc_kmem_reap_soon(). */ static const int arc_kmem_cache_reap_retry_ms = 1000; /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ static int zfs_arc_overflow_shift = 8; /* log2(fraction of arc to reclaim) */ uint_t arc_shrink_shift = 7; #ifdef _KERNEL /* percent of pagecache to reclaim arc to */ uint_t zfs_arc_pc_percent = 0; #endif /* * log2(fraction of ARC which must be free to allow growing). * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, * when reading a new block into the ARC, we will evict an equal-sized block * from the ARC. * * This must be less than arc_shrink_shift, so that when we shrink the ARC, * we will still not allow it to grow. */ uint_t arc_no_grow_shift = 5; /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ static uint_t arc_min_prefetch_ms; static uint_t arc_min_prescient_prefetch_ms; /* * If this percent of memory is free, don't throttle. */ uint_t arc_lotsfree_percent = 10; /* * The arc has filled available memory and has now warmed up. */ boolean_t arc_warm; /* * These tunables are for performance analysis. 
*/ uint64_t zfs_arc_max = 0; uint64_t zfs_arc_min = 0; static uint64_t zfs_arc_dnode_limit = 0; static uint_t zfs_arc_dnode_reduce_percent = 10; static uint_t zfs_arc_grow_retry = 0; static uint_t zfs_arc_shrink_shift = 0; uint_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ /* * ARC dirty data constraints for arc_tempreserve_space() throttle: * * total dirty data limit * * anon block dirty limit * * each pool's anon allowance */ static const unsigned long zfs_arc_dirty_limit_percent = 50; static const unsigned long zfs_arc_anon_limit_percent = 25; static const unsigned long zfs_arc_pool_dirty_percent = 20; /* * Enable or disable compressed arc buffers. */ int zfs_compressed_arc_enabled = B_TRUE; /* * Balance between metadata and data on ghost hits. Values above 100 * increase metadata caching by proportionally reducing effect of ghost * data hits on target data/metadata rate. */ static uint_t zfs_arc_meta_balance = 500; /* * Percentage that can be consumed by dnodes of ARC meta buffers. */ static uint_t zfs_arc_dnode_limit_percent = 10; /* * These tunables are Linux-specific */ static uint64_t zfs_arc_sys_free = 0; static uint_t zfs_arc_min_prefetch_ms = 0; static uint_t zfs_arc_min_prescient_prefetch_ms = 0; static uint_t zfs_arc_lotsfree_percent = 10; /* * Number of arc_prune threads */ static int zfs_arc_prune_task_threads = 1; /* Used by spa_export/spa_destroy to flush the arc asynchronously */ static taskq_t *arc_flush_taskq; /* * Controls the number of ARC eviction threads to dispatch sublists to. * * Possible values: * 0 (auto) compute the number of threads using a logarithmic formula. * 1 (disabled) one thread - parallel eviction is disabled. * 2+ (manual) set the number manually. * * See arc_evict_thread_init() for how "auto" is computed. 
*/ static uint_t zfs_arc_evict_threads = 0; /* The 7 states: */ -arc_state_t ARC_anon; -arc_state_t ARC_mru; -arc_state_t ARC_mru_ghost; -arc_state_t ARC_mfu; -arc_state_t ARC_mfu_ghost; -arc_state_t ARC_l2c_only; -arc_state_t ARC_uncached; +static arc_state_t ARC_anon; +/* */ arc_state_t ARC_mru; +static arc_state_t ARC_mru_ghost; +/* */ arc_state_t ARC_mfu; +static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; +static arc_state_t ARC_uncached; arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, { "iohits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_iohits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, { "demand_metadata_hits", KSTAT_DATA_UINT64 }, { "demand_metadata_iohits", KSTAT_DATA_UINT64 }, { "demand_metadata_misses", KSTAT_DATA_UINT64 }, { "prefetch_data_hits", KSTAT_DATA_UINT64 }, { "prefetch_data_iohits", KSTAT_DATA_UINT64 }, { "prefetch_data_misses", KSTAT_DATA_UINT64 }, { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_iohits", KSTAT_DATA_UINT64 }, { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, { "mru_hits", KSTAT_DATA_UINT64 }, { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, { "uncached_hits", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "access_skip", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, { "evict_not_enough", KSTAT_DATA_UINT64 }, { "evict_l2_cached", KSTAT_DATA_UINT64 }, { "evict_l2_eligible", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mfu", KSTAT_DATA_UINT64 }, { "evict_l2_eligible_mru", KSTAT_DATA_UINT64 }, { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "evict_l2_skip", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, { "hash_chains", KSTAT_DATA_UINT64 }, { "hash_chain_max", KSTAT_DATA_UINT64 }, { "meta", KSTAT_DATA_UINT64 }, { "pd", KSTAT_DATA_UINT64 }, { "pm", KSTAT_DATA_UINT64 }, { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "compressed_size", KSTAT_DATA_UINT64 }, { "uncompressed_size", KSTAT_DATA_UINT64 }, { "overhead_size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, { "data_size", KSTAT_DATA_UINT64 }, { "metadata_size", KSTAT_DATA_UINT64 }, { "dbuf_size", KSTAT_DATA_UINT64 }, { "dnode_size", KSTAT_DATA_UINT64 }, { "bonus_size", KSTAT_DATA_UINT64 }, #if defined(COMPAT_FREEBSD11) { "other_size", KSTAT_DATA_UINT64 }, #endif { "anon_size", KSTAT_DATA_UINT64 }, { "anon_data", KSTAT_DATA_UINT64 }, { "anon_metadata", KSTAT_DATA_UINT64 }, { "anon_evictable_data", KSTAT_DATA_UINT64 }, { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_size", KSTAT_DATA_UINT64 }, { "mru_data", KSTAT_DATA_UINT64 }, { "mru_metadata", KSTAT_DATA_UINT64 }, { "mru_evictable_data", KSTAT_DATA_UINT64 }, { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_size", KSTAT_DATA_UINT64 }, { "mru_ghost_data", KSTAT_DATA_UINT64 }, { "mru_ghost_metadata", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_size", KSTAT_DATA_UINT64 }, { "mfu_data", KSTAT_DATA_UINT64 }, { "mfu_metadata", KSTAT_DATA_UINT64 }, { "mfu_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_size", KSTAT_DATA_UINT64 }, { "mfu_ghost_data", 
KSTAT_DATA_UINT64 }, { "mfu_ghost_metadata", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, { "uncached_size", KSTAT_DATA_UINT64 }, { "uncached_data", KSTAT_DATA_UINT64 }, { "uncached_metadata", KSTAT_DATA_UINT64 }, { "uncached_evictable_data", KSTAT_DATA_UINT64 }, { "uncached_evictable_metadata", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_prefetch_asize", KSTAT_DATA_UINT64 }, { "l2_mru_asize", KSTAT_DATA_UINT64 }, { "l2_mfu_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_data_asize", KSTAT_DATA_UINT64 }, { "l2_bufc_metadata_asize", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, { "l2_read_bytes", KSTAT_DATA_UINT64 }, { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_asize", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, { "l2_log_blk_count", KSTAT_DATA_UINT64 }, { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, { "l2_rebuild_success", KSTAT_DATA_UINT64 }, { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, { "l2_rebuild_size", KSTAT_DATA_UINT64 }, { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, { "memory_throttle_count", KSTAT_DATA_UINT64 }, { "memory_direct_count", KSTAT_DATA_UINT64 }, { "memory_indirect_count", KSTAT_DATA_UINT64 }, { "memory_all_bytes", KSTAT_DATA_UINT64 }, { "memory_free_bytes", KSTAT_DATA_UINT64 }, { "memory_available_bytes", KSTAT_DATA_INT64 }, { "arc_no_grow", KSTAT_DATA_UINT64 }, { "arc_tempreserve", KSTAT_DATA_UINT64 }, { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, { "arc_prune", KSTAT_DATA_UINT64 }, { "arc_meta_used", KSTAT_DATA_UINT64 }, { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "demand_iohit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, { "arc_sys_free", KSTAT_DATA_UINT64 }, { "arc_raw_size", KSTAT_DATA_UINT64 }, { "cached_only_in_progress", KSTAT_DATA_UINT64 }, { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, }; arc_sums_t arc_sums; #define ARCSTAT_MAX(stat, val) { \ uint64_t m; \ while ((val) > (m = arc_stats.stat.value.ui64) && \ (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ continue; \ } /* * We define a macro to allow ARC hits/misses to be easily broken down by * two separate conditions, 
giving a total of four different subtypes for * each of hits and misses (so eight statistics total). */ #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ if (cond1) { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ } \ } else { \ if (cond2) { \ ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ } else { \ ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ } \ } /* * This macro allows us to use kstats as floating averages. Each time we * update this kstat, we first factor it and the update value by * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall * average. This macro assumes that integer loads and stores are atomic, but * is not safe for multiple writers updating the kstat in parallel (only the * last writer's update will remain). */ #define ARCSTAT_F_AVG_FACTOR 3 #define ARCSTAT_F_AVG(stat, value) \ do { \ uint64_t x = ARCSTAT(stat); \ x = x - x / ARCSTAT_F_AVG_FACTOR + \ (value) / ARCSTAT_F_AVG_FACTOR; \ ARCSTAT(stat) = x; \ } while (0) static kstat_t *arc_ksp; /* * There are several ARC variables that are critical to export as kstats -- * but we don't want to have to grovel around in the kstat whenever we wish to * manipulate them. For these variables, we therefore define them to be in * terms of the statistic variable. This assures that we are not introducing * the possibility of inconsistency by having shadow copies of the variables, * while still allowing the code to be readable. */ #define arc_tempreserve ARCSTAT(arcstat_tempreserve) #define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) #define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ #define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ hrtime_t arc_growtime; list_t arc_prune_list; kmutex_t arc_prune_mtx; taskq_t *arc_prune_taskq; #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ (state) == arc_l2c_only) #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_PRESCIENT_PREFETCH(hdr) \ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) #define HDR_UNCACHED(hdr) ((hdr)->b_flags & ARC_FLAG_UNCACHED) #define HDR_L2_READING(hdr) \ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) #define HDR_HAS_RABD(hdr) \ (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ (hdr)->b_crypt_hdr.b_rabd != NULL) #define HDR_ENCRYPTED(hdr) \ (HDR_PROTECTED(hdr) && 
DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) #define HDR_AUTHENTICATED(hdr) \ (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines */ #define BUF_LOCKS 2048 typedef struct buf_hash_table { uint64_t ht_mask; arc_buf_hdr_t **ht_table; kmutex_t ht_locks[BUF_LOCKS] ____cacheline_aligned; } buf_hash_table_t; static buf_hash_table_t buf_hash_table; #define BUF_HASH_INDEX(spa, dva, birth) \ (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) #define BUF_HASH_LOCK(idx) (&buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) #define HDR_LOCK(hdr) \ (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) uint64_t zfs_crc64_table[256]; /* * Asynchronous ARC flush * * We track these in a list for arc_async_flush_guid_inuse(). * Used for both L1 and L2 async teardown. */ static list_t arc_async_flush_list; static kmutex_t arc_async_flush_lock; typedef struct arc_async_flush { uint64_t af_spa_guid; taskq_ent_t af_tqent; uint_t af_cache_level; /* 1 or 2 to differentiate node */ list_node_t af_node; } arc_async_flush_t; /* * Level 2 ARC */ #define L2ARC_WRITE_SIZE (32 * 1024 * 1024) /* initial write max */ #define L2ARC_HEADROOM 8 /* num of writes */ /* * If we discover during ARC scan any buffers to be compressed, we boost * our headroom for the next scanning cycle by this percentage multiple. */ #define L2ARC_HEADROOM_BOOST 200 #define L2ARC_FEED_SECS 1 /* caching interval secs */ #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ /* * We can feed L2ARC from two states of ARC buffers, mru and mfu, * and each of the state has two types: data and metadata. 
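 *
 * Spelled out, the four (state, type) combinations are MRU/data,
 * MRU/metadata, MFU/data and MFU/metadata, which is where the value of
 * L2ARC_FEED_TYPES below comes from.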
*/ #define L2ARC_FEED_TYPES 4 /* L2ARC Performance Tunables */ -uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ -uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ -uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ -uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; -uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ -int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ -int l2arc_feed_again = B_TRUE; /* turbo warmup */ -int l2arc_norw = B_FALSE; /* no reads during writes */ +static uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +static uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +static uint64_t l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +static uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +static uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +static uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +static int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +static int l2arc_feed_again = B_TRUE; /* turbo warmup */ +static int l2arc_norw = B_FALSE; /* no reads during writes */ static uint_t l2arc_meta_percent = 33; /* limit on headers size */ /* * L2ARC Internals */ static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ static uint64_t l2arc_ndev; /* number of devices */ typedef struct l2arc_read_callback { arc_buf_hdr_t *l2rcb_hdr; /* read header */ blkptr_t l2rcb_bp; /* original blkptr */ zbookmark_phys_t l2rcb_zb; /* original bookmark */ int l2rcb_flags; /* original flags */ abd_t *l2rcb_abd; /* temporary buffer */ } l2arc_read_callback_t; typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ abd_t *l2df_abd; size_t l2df_size; arc_buf_contents_t l2df_type; list_node_t l2df_list_node; } l2arc_data_free_t; typedef enum arc_fill_flags { ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ } arc_fill_flags_t; typedef enum arc_ovf_level { ARC_OVF_NONE, /* ARC within target size. */ ARC_OVF_SOME, /* ARC is slightly overflowed. */ ARC_OVF_SEVERE /* ARC is severely overflowed. 
*/ } arc_ovf_level_t; static kmutex_t l2arc_feed_thr_lock; static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static kmutex_t l2arc_rebuild_thr_lock; static kcondvar_t l2arc_rebuild_thr_cv; enum arc_hdr_alloc_flags { ARC_HDR_ALLOC_RDATA = 0x1, ARC_HDR_USE_RESERVE = 0x4, ARC_HDR_ALLOC_LINEAR = 0x8, }; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, const void *, int); static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, const void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, const void *, int); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, const void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, const void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag); static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); static void arc_hdr_destroy(arc_buf_hdr_t *); static void arc_access(arc_buf_hdr_t *, arc_flags_t, boolean_t); static void arc_buf_watch(arc_buf_t *); static void arc_change_state(arc_state_t *, arc_buf_hdr_t *); static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); static uint32_t arc_bufc_to_flags(arc_buf_contents_t); static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); static void l2arc_do_free_on_write(void); static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only); static void arc_prune_async(uint64_t adjust); #define l2arc_hdr_arcstats_increment(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_FALSE) #define l2arc_hdr_arcstats_decrement(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_FALSE) #define l2arc_hdr_arcstats_increment_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_TRUE, B_TRUE) #define l2arc_hdr_arcstats_decrement_state(hdr) \ l2arc_hdr_arcstats_update((hdr), B_FALSE, B_TRUE) /* * l2arc_exclude_special : A zfs module parameter that controls whether buffers * present on special vdevs are eligibile for caching in L2ARC. If * set to 1, exclude dbufs on special vdevs from being cached to * L2ARC. */ int l2arc_exclude_special = 0; /* * l2arc_mfuonly : A ZFS module parameter that controls whether only MFU * metadata and data are cached from ARC into L2ARC. */ static int l2arc_mfuonly = 0; /* * L2ARC TRIM * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of * the current write size (l2arc_write_max) we should TRIM if we * have filled the device. It is defined as a percentage of the * write size. If set to 100 we trim twice the space required to * accommodate upcoming writes. A minimum of 64MB will be trimmed. * It also enables TRIM of the whole L2ARC device upon creation or * addition to an existing pool or if the header of the device is * invalid upon importing a pool or onlining a cache device. The * default is 0, which disables TRIM on L2ARC altogether as it can * put significant stress on the underlying storage devices. This * will vary depending of how well the specific device handles * these commands. */ static uint64_t l2arc_trim_ahead = 0; /* * Performance tuning of L2ARC persistence: * * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding * an L2ARC device (either at pool import or later) will attempt * to rebuild L2ARC buffer contents. 
* l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls * whether log blocks are written to the L2ARC device. If the L2ARC * device is less than 1GB, the amount of data l2arc_evict() * evicts is significant compared to the amount of restored L2ARC * data. In this case do not write log blocks in L2ARC in order * not to waste space. */ static int l2arc_rebuild_enabled = B_TRUE; static uint64_t l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; /* L2ARC persistence rebuild control routines. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg); static int l2arc_rebuild(l2arc_dev_t *dev); /* L2ARC persistence read I/O routines. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev); static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io); static zio_t *l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); static void l2arc_log_blk_fetch_abort(zio_t *zio); /* L2ARC persistence block restoration routines. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize); static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp); static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab); boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check); static void l2arc_blk_fetch_done(zio_t *zio); static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); /* * We use Cityhash for this. It's fast, and has good hash properties without * requiring any large static buffers. */ static uint64_t buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); } #define HDR_EMPTY(hdr) \ ((hdr)->b_dva.dva_word[0] == 0 && \ (hdr)->b_dva.dva_word[1] == 0) #define HDR_EMPTY_OR_LOCKED(hdr) \ (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) #define HDR_EQUAL(spa, dva, birth, hdr) \ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) static void buf_discard_identity(arc_buf_hdr_t *hdr) { hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; } static arc_buf_hdr_t * buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) { const dva_t *dva = BP_IDENTITY(bp); uint64_t birth = BP_GET_PHYSICAL_BIRTH(bp); uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *hdr; mutex_enter(hash_lock); for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; hdr = hdr->b_hash_next) { if (HDR_EQUAL(spa, dva, birth, hdr)) { *lockp = hash_lock; return (hdr); } } mutex_exit(hash_lock); *lockp = NULL; return (NULL); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. * If lockp == NULL, the caller is assumed to already hold the hash lock. 
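 *
 * Purely as an illustration of the calling convention (a sketch of a
 * typical caller, not something this function requires), a caller that can
 * race with a concurrent insert looks roughly like:
 *
 *      kmutex_t *hash_lock = NULL;
 *      arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock);
 *      if (exists != NULL) {
 *              handle the equal header that won the race
 *      }
 *      mutex_exit(hash_lock);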
*/ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); arc_buf_hdr_t *fhdr; uint32_t i; ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (lockp != NULL) { *lockp = hash_lock; mutex_enter(hash_lock); } else { ASSERT(MUTEX_HELD(hash_lock)); } for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) return (fhdr); } hdr->b_hash_next = buf_hash_table.ht_table[idx]; buf_hash_table.ht_table[idx] = hdr; arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ if (i > 0) { ARCSTAT_BUMP(arcstat_hash_collisions); if (i == 1) ARCSTAT_BUMP(arcstat_hash_chains); ARCSTAT_MAX(arcstat_hash_chain_max, i); } ARCSTAT_BUMP(arcstat_hash_elements); return (NULL); } static void buf_hash_remove(arc_buf_hdr_t *hdr) { arc_buf_hdr_t *fhdr, **hdrp; uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdrp = &buf_hash_table.ht_table[idx]; while ((fhdr = *hdrp) != hdr) { ASSERT3P(fhdr, !=, NULL); hdrp = &fhdr->b_hash_next; } *hdrp = hdr->b_hash_next; hdr->b_hash_next = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); /* collect some hash table performance data */ ARCSTAT_BUMPDOWN(arcstat_hash_elements); if (buf_hash_table.ht_table[idx] && buf_hash_table.ht_table[idx]->b_hash_next == NULL) ARCSTAT_BUMPDOWN(arcstat_hash_chains); } /* * Global data structures and functions for the buf kmem cache. */ static kmem_cache_t *hdr_full_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void buf_fini(void) { #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_free() in the linux kernel\ */ vmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #else kmem_free(buf_hash_table.ht_table, (buf_hash_table.ht_mask + 1) * sizeof (void *)); #endif for (int i = 0; i < BUF_LOCKS; i++) mutex_destroy(BUF_HASH_LOCK(i)); kmem_cache_destroy(hdr_full_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } /* * Constructor callback - called when the cache is empty * and a new buf is requested. */ static int hdr_full_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_FULL_SIZE); hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); #endif multilist_link_init(&hdr->b_l1hdr.b_arc_node); list_link_init(&hdr->b_l2hdr.b_l2node); arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); return (0); } static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_hdr_t *hdr = vbuf; memset(hdr, 0, HDR_L2ONLY_SIZE); arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } static int buf_cons(void *vbuf, void *unused, int kmflag) { (void) unused, (void) kmflag; arc_buf_t *buf = vbuf; memset(buf, 0, sizeof (arc_buf_t)); arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); return (0); } /* * Destructor callback - called when a cached buf is * no longer required. 
*/ static void hdr_full_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); #ifdef ZFS_DEBUG mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); #endif ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); } static void hdr_l2only_dest(void *vbuf, void *unused) { (void) unused; arc_buf_hdr_t *hdr = vbuf; ASSERT(HDR_EMPTY(hdr)); arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } static void buf_dest(void *vbuf, void *unused) { (void) unused; (void) vbuf; arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } static void buf_init(void) { uint64_t *ct = NULL; uint64_t hsize = 1ULL << 12; int i, j; /* * The hash table is big enough to fill all of physical memory * with an average block size of zfs_arc_average_blocksize (default 8K). * By default, the table will take up * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). */ while (hsize * zfs_arc_average_blocksize < arc_all_memory()) hsize <<= 1; retry: buf_hash_table.ht_mask = hsize - 1; #if defined(_KERNEL) /* * Large allocations which do not require contiguous pages * should be using vmem_alloc() in the linux kernel */ buf_hash_table.ht_table = vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); #else buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); #endif if (buf_hash_table.ht_table == NULL) { ASSERT(hsize > (1ULL << 8)); hsize >>= 1; goto retry; } hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, KMC_RECLAIMABLE); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); for (i = 0; i < 256; i++) for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); for (i = 0; i < BUF_LOCKS; i++) mutex_init(BUF_HASH_LOCK(i), NULL, MUTEX_DEFAULT, NULL); } #define ARC_MINTIME (hz>>4) /* 62 ms */ /* * This is the size that the buf occupies in memory. If the buf is compressed, * it will correspond to the compressed size. You should use this method of * getting the buf size unless you explicitly need the logical size. */ uint64_t arc_buf_size(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); } uint64_t arc_buf_lsize(arc_buf_t *buf) { return (HDR_GET_LSIZE(buf->b_hdr)); } /* * This function will return B_TRUE if the buffer is encrypted in memory. * This buffer can be decrypted by calling arc_untransform(). */ boolean_t arc_is_encrypted(arc_buf_t *buf) { return (ARC_BUF_ENCRYPTED(buf) != 0); } /* * Returns B_TRUE if the buffer represents data that has not had its MAC * verified yet. */ boolean_t arc_is_unauthenticated(arc_buf_t *buf) { return (HDR_NOAUTH(buf->b_hdr) != 0); } void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, uint8_t *iv, uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_PROTECTED(hdr)); memcpy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; } /* * Indicates how this buffer is compressed in memory. If it is not compressed * the value will be ZIO_COMPRESS_OFF. 
It can be made normally readable with * arc_untransform() as long as it is also unencrypted. */ enum zio_compress arc_get_compression(arc_buf_t *buf) { return (ARC_BUF_COMPRESSED(buf) ? HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); } /* * Return the compression algorithm used to store this data in the ARC. If ARC * compression is enabled or this is an encrypted block, this will be the same * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. */ static inline enum zio_compress arc_hdr_get_compress(arc_buf_hdr_t *hdr) { return (HDR_COMPRESSION_ENABLED(hdr) ? HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); } uint8_t arc_get_complevel(arc_buf_t *buf) { return (buf->b_hdr->b_complevel); } static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { boolean_t shared = (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); EQUIV(shared, ARC_BUF_SHARED(buf)); IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); /* * It would be nice to assert arc_can_share() too, but the "hdr isn't * already being shared" requirement prevents us from doing that. */ return (shared); } /* * Free the checksum associated with this header. If there is no checksum, this * is a no-op. */ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { #ifdef ZFS_DEBUG ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); hdr->b_l1hdr.b_freeze_cksum = NULL; } mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * Return true iff at least one of the bufs on hdr is not compressed. * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) { ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { if (!ARC_BUF_COMPRESSED(b)) { return (B_TRUE); } } return (B_FALSE); } /* * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data * matches the checksum that is stored in the hdr. If there is no checksum, * or if the buf is compressed, this is a no-op. */ static void arc_cksum_verify(arc_buf_t *buf) { #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; zio_cksum_t zc; if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) panic("buffer modified while frozen!"); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif } /* * This function makes the assumption that data stored in the L2ARC * will be transformed exactly as it is in the main pool. Because of * this we can verify the checksum against the reading process's bp. */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); /* * Block pointers always store the checksum for the logical data. * If the block pointer has the gang bit set, then the checksum * it represents is for the reconstituted data and not for an * individual gang member. 
The zio pipeline, however, must be able to * determine the checksum of each of the gang constituents so it * treats the checksum comparison differently than what we need * for l2arc blocks. This prevents us from using the * zio_checksum_error() interface directly. Instead we must call the * zio_checksum_error_impl() so that we can ensure the checksum is * generated using the correct checksum algorithm and accounts for the * logical I/O size and not just a gang fragment. */ return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, zio->io_offset, NULL) == 0); } /* * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a * checksum and attaches it to the buf's hdr so that we can ensure that the buf * isn't modified later on. If buf is compressed or there is already a checksum * on the hdr, this is a no-op (we only checksum uncompressed bufs). */ static void arc_cksum_compute(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; #ifdef ZFS_DEBUG arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_HAS_L1HDR(hdr)); mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { mutex_exit(&hdr->b_l1hdr.b_freeze_lock); return; } ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, hdr->b_l1hdr.b_freeze_cksum); mutex_exit(&hdr->b_l1hdr.b_freeze_lock); #endif arc_buf_watch(buf); } #ifndef _KERNEL void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) { (void) sig, (void) unused; panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); } #endif static void arc_buf_unwatch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) { ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ | PROT_WRITE)); } #else (void) buf; #endif } static void arc_buf_watch(arc_buf_t *buf) { #ifndef _KERNEL if (arc_watch) ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), PROT_READ)); #else (void) buf; #endif } static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *hdr) { arc_buf_contents_t type; if (HDR_ISTYPE_METADATA(hdr)) { type = ARC_BUFC_METADATA; } else { type = ARC_BUFC_DATA; } VERIFY3U(hdr->b_type, ==, type); return (type); } boolean_t arc_is_metadata(arc_buf_t *buf) { return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); } static uint32_t arc_bufc_to_flags(arc_buf_contents_t type) { switch (type) { case ARC_BUFC_DATA: /* metadata field is 0 if buffer contains normal data */ return (0); case ARC_BUFC_METADATA: return (ARC_FLAG_BUFC_METADATA); default: break; } panic("undefined ARC buffer type!"); return ((uint32_t)-1); } void arc_buf_thaw(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_cksum_verify(buf); /* * Compressed buffers do not manipulate the b_freeze_cksum. */ if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(hdr)); arc_cksum_free(hdr); arc_buf_unwatch(buf); } void arc_buf_freeze(arc_buf_t *buf) { if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; if (ARC_BUF_COMPRESSED(buf)) return; ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); arc_cksum_compute(buf); } /* * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, * the following functions should be used to ensure that the flags are * updated in a thread-safe way. When manipulating the flags either * the hash_lock must be held or the hdr must be undiscoverable. 
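 * As a rough sketch (illustration only, using the helpers declared earlier
 * in this file), an update on a header that is still discoverable through
 * the hash table looks like:
 *
 *      mutex_enter(hash_lock);
 *      arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
 *      mutex_exit(hash_lock);
 *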
This * ensures that we're not racing with any other threads when updating * the flags. */ static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags |= flags; } static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); hdr->b_flags &= ~flags; } /* * Setting the compression bits in the arc_buf_hdr_t's b_flags is * done in a special way since we have to clear and set bits * at the same time. Consumers that wish to set the compression bits * must use this function to ensure that the flags are updated in * thread-safe manner. */ static void arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) { ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Holes and embedded blocks will always have a psize = 0 so * we ignore the compression of the blkptr and set the * want to uncompress them. Mark them as uncompressed. */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } HDR_SET_COMPRESS(hdr, cmp); ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* * Looks for another buf on the same hdr which has the data decompressed, copies * from it, and returns true. If no such buf exists, returns false. */ static boolean_t arc_buf_try_copy_decompressed_data(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t copied = B_FALSE; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(buf->b_data, !=, NULL); ASSERT(!ARC_BUF_COMPRESSED(buf)); for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; from = from->b_next) { /* can't use our own data buffer */ if (from == buf) { continue; } if (!ARC_BUF_COMPRESSED(from)) { memcpy(buf->b_data, from->b_data, arc_buf_size(buf)); copied = B_TRUE; break; } } #ifdef ZFS_DEBUG /* * There were no decompressed bufs, so there should not be a * checksum on the hdr either. */ if (zfs_flags & ZFS_DEBUG_MODIFY) EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); #endif return (copied); } /* * Allocates an ARC buf header that's in an evicted & L2-cached state. * This is used during l2arc reconstruction to make empty ARC buffers * which circumvent the regular disk->arc->l2arc path and instead come * into being in the reverse order, i.e. l2arc->arc. */ static arc_buf_hdr_t * arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t psize, uint64_t asize, uint64_t birth, enum zio_compress compress, uint8_t complevel, boolean_t protected, boolean_t prefetch, arc_state_type_t arcs_state) { arc_buf_hdr_t *hdr; ASSERT(size != 0); ASSERT(dev->l2ad_vdev != NULL); hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); hdr->b_birth = birth; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); HDR_SET_LSIZE(hdr, size); HDR_SET_PSIZE(hdr, psize); HDR_SET_L2SIZE(hdr, asize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); if (prefetch) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); hdr->b_dva = dva; hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = daddr; hdr->b_l2hdr.b_arcs_state = arcs_state; return (hdr); } /* * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 
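 *
 * For example (illustrative numbers only): a 128K logical block kept
 * compressed in the ARC at a 16K psize reports 16K here; if compression is
 * off for this header, or its psize is 0, the 128K lsize is returned
 * instead.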
*/ static uint64_t arc_hdr_size(arc_buf_hdr_t *hdr) { uint64_t size; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && HDR_GET_PSIZE(hdr) > 0) { size = HDR_GET_PSIZE(hdr); } else { ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); size = HDR_GET_LSIZE(hdr); } return (size); } static int arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) { int ret; uint64_t csize; uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); abd_t *abd = hdr->b_l1hdr.b_pabd; boolean_t free_abd = B_FALSE; ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_AUTHENTICATED(hdr)); ASSERT3P(abd, !=, NULL); /* * The MAC is calculated on the compressed data that is stored on disk. * However, if compressed arc is disabled we will only have the * decompressed data available to us now. Compress it into a temporary * abd so we can verify the MAC. The performance overhead of this will * be relatively low, since most objects in an encrypted objset will * be encrypted (instead of authenticated) anyway. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd = NULL; csize = zio_compress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, &abd, lsize, MIN(lsize, psize), hdr->b_complevel); if (csize >= lsize || csize > psize) { ret = SET_ERROR(EIO); return (ret); } ASSERT3P(abd, !=, NULL); abd_zero_off(abd, csize, psize - csize); free_abd = B_TRUE; } /* * Authentication is best effort. We authenticate whenever the key is * available. If we succeed we clear ARC_FLAG_NOAUTH. */ if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); ASSERT3U(lsize, ==, psize); ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); } else { ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, hdr->b_crypt_hdr.b_mac); } if (ret == 0) arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); else if (ret == ENOENT) ret = 0; if (free_abd) abd_free(abd); return (ret); } /* * This function will take a header that only has raw encrypted data in * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in * b_l1hdr.b_pabd. If designated in the header flags, this function will * also decompress the data. */ static int arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) { int ret; abd_t *cabd = NULL; boolean_t no_crypt = B_FALSE; boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT(HDR_ENCRYPTED(hdr)); arc_hdr_alloc_abd(hdr, 0); ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) { abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); } /* * If this header has disabled arc compression but the b_pabd is * compressed after decrypting it, we need to decompress the newly * decrypted data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { /* * We want to make sure that we are correctly honoring the * zfs_abd_scatter_enabled setting, so we allocate an abd here * and then loan a buffer from it, rather than allocating a * linear buffer and wrapping it in an abd later. 
*/ cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, 0); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { goto error; } arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; } return (0); error: arc_hdr_free_abd(hdr, B_FALSE); if (cabd != NULL) arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); return (ret); } /* * This function is called during arc_buf_fill() to prepare the header's * abd plaintext pointer for use. This involves authenticated protected * data and decrypting encrypted data into the plaintext abd. */ static int arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, const zbookmark_phys_t *zb, boolean_t noauth) { int ret; ASSERT(HDR_PROTECTED(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); if (HDR_NOAUTH(hdr) && !noauth) { /* * The caller requested authenticated data but our data has * not been authenticated yet. Verify the MAC now if we can. */ ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); if (ret != 0) goto error; } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { /* * If we only have the encrypted version of the data, but the * unencrypted version was requested we take this opportunity * to store the decrypted version in the header for future use. */ ret = arc_hdr_decrypt(hdr, spa, zb); if (ret != 0) goto error; } ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); if (hash_lock != NULL) mutex_exit(hash_lock); return (0); error: if (hash_lock != NULL) mutex_exit(hash_lock); return (ret); } /* * This function is used by the dbuf code to decrypt bonus buffers in place. * The dbuf code itself doesn't have any locking for decrypting a shared dnode * block, so we use the hash lock here to protect against concurrent calls to * arc_buf_fill(). */ static void arc_buf_untransform_in_place(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(HDR_ENCRYPTED(hdr)); ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); ASSERT3PF(hdr->b_l1hdr.b_pabd, !=, NULL, "hdr %px buf %px", hdr, buf); zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr * are already sharing a data buf, no copy is performed. * * If the buf is marked as compressed but uncompressed data was requested, this * will allocate a new data buffer for the buf, remove that flag, and fill the * buf with uncompressed data. You can't request a compressed buf on a hdr with * uncompressed data, and (since we haven't added support for it yet) if you * want compressed data your buf must already be marked as compressed and have * the correct-sized data buffer. */ static int arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, arc_fill_flags_t flags) { int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; boolean_t hdr_compressed = (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? 
NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, HDR_ENCRYPTED(hdr)); IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); IMPLY(encrypted, !arc_buf_is_shared(buf)); /* * If the caller wanted encrypted data we just need to copy it from * b_rabd and potentially byteswap it. We won't be able to do any * further transforms on it. */ if (encrypted) { ASSERT(HDR_HAS_RABD(hdr)); abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, HDR_GET_PSIZE(hdr)); goto byteswap; } /* * Adjust encrypted and authenticated headers to accommodate * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are * allowed to fail decryption due to keys not being loaded * without being marked as an IO error. */ if (HDR_PROTECTED(hdr)) { error = arc_fill_hdr_crypt(hdr, hash_lock, spa, zb, !!(flags & ARC_FILL_NOAUTH)); if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { return (error); } else if (error != 0) { if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (error); } } /* * There is a special case here for dnode blocks which are * decrypting their bonus buffers. These blocks may request to * be decrypted in-place. This is necessary because there may * be many dnodes pointing into this buffer and there is * currently no method to synchronize replacing the backing * b_data buffer and updating all of the pointers. Here we use * the hash lock to ensure there are no races. If the need * arises for other types to be decrypted in-place, they must * add handling here as well. */ if ((flags & ARC_FILL_IN_PLACE) != 0) { ASSERT(!hdr_compressed); ASSERT(!compressed); ASSERT(!encrypted); if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); if (hash_lock != NULL) mutex_enter(hash_lock); arc_buf_untransform_in_place(buf); if (hash_lock != NULL) mutex_exit(hash_lock); /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); } return (0); } if (hdr_compressed == compressed) { if (ARC_BUF_SHARED(buf)) { ASSERT(arc_buf_is_shared(buf)); } else { abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, arc_buf_size(buf)); } } else { ASSERT(hdr_compressed); ASSERT(!compressed); /* * If the buf is sharing its data with the hdr, unlink it and * allocate a new data buffer for the buf. */ if (ARC_BUF_SHARED(buf)) { ASSERTF(ARC_BUF_COMPRESSED(buf), "buf %p was uncompressed", buf); /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); /* Previously overhead was 0; just add new overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT(!arc_buf_is_shared(buf)); /* We need to reallocate the buf's b_data */ arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), buf); buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); /* We increased the size of b_data; update overhead */ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); } /* * Regardless of the buf's previous compression settings, it * should not be compressed at the end of this function. */ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; /* * Try copying the data from another buf which already has a * decompressed version. 
If that's not possible, it's time to * bite the bullet and decompress the data from the hdr. */ if (arc_buf_try_copy_decompressed_data(buf)) { /* Skip byteswapping and checksumming (already done) */ return (0); } else { abd_t dabd; abd_get_from_buf_struct(&dabd, buf->b_data, HDR_GET_LSIZE(hdr)); error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, &dabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); abd_free(&dabd); /* * Absent hardware errors or software bugs, this should * be impossible, but log it anyway so we can debug it. */ if (error != 0) { zfs_dbgmsg( "hdr %px, compress %d, psize %d, lsize %d", hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); if (hash_lock != NULL) mutex_enter(hash_lock); arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hash_lock != NULL) mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); } /* Compute the hdr's checksum if necessary */ arc_cksum_compute(buf); return (0); } /* * If this function is being called to decrypt an encrypted buffer or verify an * authenticated one, the key must be loaded and a mapping must be made * available in the keystore via spa_keystore_create_mapping() or one of its * callers. */ int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, boolean_t in_place) { int ret; arc_fill_flags_t flags = 0; if (in_place) flags |= ARC_FILL_IN_PLACE; ret = arc_buf_fill(buf, spa, zb, flags); if (ret == ECKSUM) { /* * Convert authentication and decryption errors to EIO * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); spa_log_error(spa, zb, buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } return (ret); } /* * Increment the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. */ static void arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0P(hdr->b_l1hdr.b_buf); ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Decrement the amount of evictable space in the arc_state_t's refcount. * We account for the space used by the hdr and the arc buf individually * so that we can add and remove them from the refcount individually. 
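 *
 * As a rough worked example (illustrative sizes only): a header holding a
 * 16K compressed b_pabd plus one unshared 128K uncompressed buf contributed
 * 16K + 128K of evictable space, and each amount is removed separately
 * here; shared bufs are skipped because their memory belongs to the hdr and
 * is already covered by the b_pabd accounting.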
*/ static void arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) { arc_buf_contents_t type = arc_buf_type(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(state)) { ASSERT0P(hdr->b_l1hdr.b_buf); ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_PSIZE(hdr), hdr); } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_buf_size(buf), buf); } } /* * Add a reference to this hdr indicating that someone is actively * referencing that memory. When the refcount transitions from 0 to 1, * we remove it from the respective arc_state_t list to indicate that * it is not evictable. */ static void add_reference(arc_buf_hdr_t *hdr, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { ASSERT(state == arc_anon); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT0P(hdr->b_l1hdr.b_buf); } if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && state != arc_anon && state != arc_l2c_only) { /* We don't use the L2-only state list. */ multilist_remove(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_decrement(hdr, state); } } /* * Remove a reference from this hdr. When the reference transitions from * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's * list making it eligible for eviction. */ static int remove_reference(arc_buf_hdr_t *hdr, const void *tag) { int cnt; arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(!GHOST_STATE(state)); /* arc_l2c_only counts as a ghost. */ if ((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) != 0) return (cnt); if (state == arc_anon) { arc_hdr_destroy(hdr); return (0); } if (state == arc_uncached && !HDR_PREFETCH(hdr)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); return (0); } multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr); arc_evictable_space_increment(hdr, state); return (0); } /* * Returns detailed information about a specific arc buffer. When the * state_index argument is set the function will calculate the arc header * list position for its arc state. Since this requires a linear traversal * callers are strongly encourage not to do this. However, it can be helpful * for targeted analysis so the functionality is provided. 
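 *
 * A minimal usage sketch (illustrative only; "ab" is whatever arc_buf_t the
 * caller is inspecting):
 *
 *      arc_buf_info_t abi;
 *      arc_buf_info(ab, &abi, 0);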
*/ void arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) { (void) state_index; arc_buf_hdr_t *hdr = ab->b_hdr; l1arc_buf_hdr_t *l1hdr = NULL; l2arc_buf_hdr_t *l2hdr = NULL; arc_state_t *state = NULL; memset(abi, 0, sizeof (arc_buf_info_t)); if (hdr == NULL) return; abi->abi_flags = hdr->b_flags; if (HDR_HAS_L1HDR(hdr)) { l1hdr = &hdr->b_l1hdr; state = l1hdr->b_state; } if (HDR_HAS_L2HDR(hdr)) l2hdr = &hdr->b_l2hdr; if (l1hdr) { abi->abi_bufcnt = 0; for (arc_buf_t *buf = l1hdr->b_buf; buf; buf = buf->b_next) abi->abi_bufcnt++; abi->abi_access = l1hdr->b_arc_access; abi->abi_mru_hits = l1hdr->b_mru_hits; abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; abi->abi_mfu_hits = l1hdr->b_mfu_hits; abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); } if (l2hdr) { abi->abi_l2arc_dattr = l2hdr->b_daddr; abi->abi_l2arc_hits = l2hdr->b_hits; } abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; abi->abi_state_contents = arc_buf_type(hdr); abi->abi_size = arc_hdr_size(hdr); } /* * Move the supplied buffer to the indicated state. The hash lock * for the buffer must be held by the caller. */ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr) { arc_state_t *old_state; int64_t refcnt; boolean_t update_old, update_new; arc_buf_contents_t type = arc_buf_type(hdr); /* * We almost always have an L1 hdr here, since we call arc_hdr_realloc() * in arc_read() when bringing a buffer out of the L2ARC. However, the * L1 hdr doesn't always exist when we change state to arc_anon before * destroying a header, in which case reallocating to add the L1 hdr is * pointless. */ if (HDR_HAS_L1HDR(hdr)) { old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); update_old = (hdr->b_l1hdr.b_buf != NULL || hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(GHOST_STATE(old_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(GHOST_STATE(new_state), hdr->b_l1hdr.b_buf == NULL); IMPLY(old_state == arc_anon, hdr->b_l1hdr.b_buf == NULL || ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); } else { old_state = arc_l2c_only; refcnt = 0; update_old = B_FALSE; } update_new = update_old; if (GHOST_STATE(old_state)) update_old = B_TRUE; if (GHOST_STATE(new_state)) update_new = B_TRUE; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(new_state, !=, old_state); /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { if (old_state != arc_anon && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); /* remove_reference() saves on insert. */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { multilist_remove(&old_state->arcs_list[type], hdr); arc_evictable_space_decrement(hdr, old_state); } } if (new_state != arc_anon && new_state != arc_l2c_only) { /* * An L1 header always exists here, since if we're * moving to some L1-cached state (i.e. not l2c_only or * anonymous), we realloc the header to add an L1hdr * beforehand. */ ASSERT(HDR_HAS_L1HDR(hdr)); multilist_insert(&new_state->arcs_list[type], hdr); arc_evictable_space_increment(hdr, new_state); } } ASSERT(!HDR_EMPTY(hdr)); if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); /* adjust state sizes (ignore arc_l2c_only) */ if (update_new && new_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(new_state)) { /* * When moving a header to a ghost state, we first * remove all arc buffers. Thus, we'll have no arc * buffer to use for the reference. 
As a result, we * use the arc header pointer for the reference. */ (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); } else { /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_buf_size(buf), buf); } if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_add_many( &new_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (update_old && old_state != arc_l2c_only) { ASSERT(HDR_HAS_L1HDR(hdr)); if (GHOST_STATE(old_state)) { ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, * the header will not contain any arc buffers. * We use the arc header pointer for the reference * which is exactly what we did when we put the * header on the ghost state. */ (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_LSIZE(hdr), hdr); } else { /* * Each individual buffer holds a unique reference, * thus we must remove each of these references one * at a time. */ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { /* * When the arc_buf_t is sharing the data * block with the hdr, the owner of the * reference belongs to the hdr. Only * add to the refcount if the arc_buf_t is * not shared. */ if (ARC_BUF_SHARED(buf)) continue; (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_buf_size(buf), buf); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); if (hdr->b_l1hdr.b_pabd != NULL) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], arc_hdr_size(hdr), hdr); } if (HDR_HAS_RABD(hdr)) { (void) zfs_refcount_remove_many( &old_state->arcs_size[type], HDR_GET_PSIZE(hdr), hdr); } } } if (HDR_HAS_L1HDR(hdr)) { hdr->b_l1hdr.b_state = new_state; if (HDR_HAS_L2HDR(hdr) && new_state != arc_l2c_only) { l2arc_hdr_arcstats_decrement_state(hdr); hdr->b_l2hdr.b_arcs_state = new_state->arcs_state; l2arc_hdr_arcstats_increment_state(hdr); } } } void arc_space_consume(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, space); break; case ARC_SPACE_DNODE: aggsum_add(&arc_sums.arcstat_dnode_size, space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, space); break; case ARC_SPACE_ABD_CHUNK_WASTE: /* * Note: this includes space wasted by all scatter ABD's, not * just those allocated by the ARC. But the vast majority of * scatter ABD's come from the ARC, because other users are * very short-lived. 
*/ ARCSTAT_INCR(arcstat_abd_chunk_waste_size, space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, space); aggsum_add(&arc_sums.arcstat_size, space); } void arc_space_return(uint64_t space, arc_space_type_t type) { ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); switch (type) { default: break; case ARC_SPACE_DATA: ARCSTAT_INCR(arcstat_data_size, -space); break; case ARC_SPACE_META: ARCSTAT_INCR(arcstat_metadata_size, -space); break; case ARC_SPACE_BONUS: ARCSTAT_INCR(arcstat_bonus_size, -space); break; case ARC_SPACE_DNODE: aggsum_add(&arc_sums.arcstat_dnode_size, -space); break; case ARC_SPACE_DBUF: ARCSTAT_INCR(arcstat_dbuf_size, -space); break; case ARC_SPACE_HDRS: ARCSTAT_INCR(arcstat_hdr_size, -space); break; case ARC_SPACE_L2HDRS: aggsum_add(&arc_sums.arcstat_l2_hdr_size, -space); break; case ARC_SPACE_ABD_CHUNK_WASTE: ARCSTAT_INCR(arcstat_abd_chunk_waste_size, -space); break; } if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) ARCSTAT_INCR(arcstat_meta_used, -space); ASSERT(aggsum_compare(&arc_sums.arcstat_size, space) >= 0); aggsum_add(&arc_sums.arcstat_size, -space); } /* * Given a hdr and a buf, returns whether that buf can share its b_data buffer * with the hdr's b_pabd. */ static boolean_t arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: * 1. the buffer is not encrypted * 2. the hdr's compression matches the buf's compression * 3. the hdr doesn't need to be byteswapped * 4. the hdr isn't already being shared * 5. the buf is either compressed or it is the last buf in the hdr list * * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create * a shared uncompressed buf anyway (because the hdr must be compressed * to have the compressed buf). You might also think that #3 is * sufficient to make this guarantee, however it's possible * (specifically in the rare L2ARC write race mentioned in * arc_buf_alloc_impl()) there will be an existing uncompressed buf that * is shareable, but wasn't at the time of its allocation. Rather than * allow a new shared uncompressed buf to be created and then shuffle * the list around to make it the last element, this simply disallows * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; return (!ARC_BUF_ENCRYPTED(buf) && buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); } /* * Allocate a buf for this hdr. If you care about the data that's in the hdr, * or if you want a compressed buffer, pass those flags in. Returns 0 if the * copy was made successfully, or an error code otherwise. 
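 *
 * As a hedged sketch of the argument order only (not a prescribed call
 * site), a caller wanting the plain uncompressed data filled in would do
 * roughly:
 *
 *      arc_buf_t *buf = NULL;
 *      int err = arc_buf_alloc_impl(hdr, spa, zb, tag,
 *          B_FALSE, B_FALSE, B_FALSE, B_TRUE, &buf);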
*/ static int arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, const void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); VERIFY(hdr->b_type == ARC_BUFC_DATA || hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT0P(*ret); IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_next = hdr->b_l1hdr.b_buf; buf->b_flags = 0; add_reference(hdr, tag); /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Only honor requests for compressed bufs if the hdr is actually * compressed. This must be overridden if the buffer is encrypted since * encrypted buffers cannot be decompressed. */ if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; } else if (compressed && arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; flags |= ARC_FILL_COMPRESSED; } if (noauth) { ASSERT0(encrypted); flags |= ARC_FILL_NOAUTH; } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new * buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be * actively involved in an L2ARC write, because if this buf is used by * an arc_write() then the hdr's data buffer will be released when the * write completes, even though the L2ARC write might still be using it. * Second, the hdr's ABD must be linear so that the buf's user doesn't * need to be ABD-aware. It must be allocated via * zio_[data_]buf_alloc(), not as a page, because we need to be able * to abd_release_ownership_of_buf(), which isn't allowed on "linear * page" buffers because the ABD code needs to handle freeing them * specially. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd) && !abd_is_linear_page(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); buf->b_flags |= ARC_BUF_FLAG_SHARED; arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); } else { buf->b_data = arc_get_data_buf(hdr, arc_buf_size(buf), buf); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } VERIFY3P(buf->b_data, !=, NULL); hdr->b_l1hdr.b_buf = buf; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { ASSERT3P(zb, !=, NULL); return (arc_buf_fill(buf, spa, zb, flags)); } return (0); } static const char *arc_onloan_tag = "onloan"; static inline void arc_loaned_bytes_update(int64_t delta) { atomic_add_64(&arc_loaned_bytes, delta); /* assert that it did not wrap around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); } /* * Loan out an anonymous arc buffer. Loaned buffers are not counted as in * flight data by arc_tempreserve_space() until they are "returned". Loaned * buffers must be returned to the arc before they can be used by the DMU or * freed. 
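 *
 * An illustrative loan/return cycle (sketch only; "blksz" and "tag" are
 * hypothetical caller-side values):
 *
 *      arc_buf_t *abuf = arc_loan_buf(spa, B_FALSE, blksz);
 *      ... fill abuf->b_data ...
 *      arc_return_buf(abuf, tag);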
*/ arc_buf_t * arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) { arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, psize, lsize, compression_type, complevel); arc_loaned_bytes_update(arc_buf_size(buf)); return (buf); } arc_buf_t * arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, byteorder, salt, iv, mac, ot, psize, lsize, compression_type, complevel); atomic_add_64(&arc_loaned_bytes, psize); return (buf); } /* * Return a loaned arc buffer to the arc. */ void arc_return_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); arc_loaned_bytes_update(-arc_buf_size(buf)); } /* Detach an arc_buf from a dbuf (tag) */ void arc_loan_inuse_buf(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT3P(buf->b_data, !=, NULL); ASSERT(HDR_HAS_L1HDR(hdr)); (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); arc_loaned_bytes_update(arc_buf_size(buf)); } static void l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) { l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); df->l2df_abd = abd; df->l2df_size = size; df->l2df_type = type; mutex_enter(&l2arc_free_on_write_mtx); list_insert_head(l2arc_free_on_write, df); mutex_exit(&l2arc_free_on_write_mtx); } static void arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, hdr); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, hdr); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } if (free_rdata) { l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); } else { l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); } } /* * Share the arc_buf_t's data with the hdr. Whenever we are sharing the * data buffer, we transfer the refcount ownership to the hdr and update * the appropriate kstats. */ static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_can_share(hdr, buf)); ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * Start sharing the data buffer. We transfer the * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. 
*/ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); buf->b_flags |= ARC_BUF_FLAG_SHARED; /* * Since we've transferred ownership to the hdr we need * to increment its compressed and uncompressed kstats and * decrement the overhead size. */ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); } static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); /* * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ zfs_refcount_transfer_ownership_many( &hdr->b_l1hdr.b_state->arcs_size[arc_buf_type(hdr)], arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_free(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = NULL; buf->b_flags &= ~ARC_BUF_FLAG_SHARED; /* * Since the buffer is no longer shared between * the arc buf and the hdr, count it as overhead. */ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); } /* * Remove an arc_buf_t from the hdr's buf list and return the last * arc_buf_t on the list. If no buffers remain on the list then return * NULL. */ static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. */ while (*bufp != NULL) { if (*bufp == buf) *bufp = buf->b_next; /* * If we've removed a buffer in the middle of * the list then update the lastbuf and update * bufp. */ if (*bufp != NULL) { lastbuf = *bufp; bufp = &(*bufp)->b_next; } } buf->b_next = NULL; ASSERT3P(lastbuf, !=, buf); IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); return (lastbuf); } /* * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's * list and free it. */ static void arc_buf_destroy_impl(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Free up the data associated with the buf but only if we're not * sharing this with the hdr. If we are sharing it with the hdr, the * hdr is responsible for doing the free. */ if (buf->b_data != NULL) { /* * We're about to change the hdr's b_flags. We must either * hold the hash_lock or be undiscoverable. */ ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); arc_cksum_verify(buf); arc_buf_unwatch(buf); if (ARC_BUF_SHARED(buf)) { arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); } else { ASSERT(!arc_buf_is_shared(buf)); uint64_t size = arc_buf_size(buf); arc_free_data_buf(hdr, buf->b_data, size, buf); ARCSTAT_INCR(arcstat_overhead_size, -size); } buf->b_data = NULL; /* * If we have no more encrypted buffers and we've already * gotten a copy of the decrypted data we can free b_rabd * to save some space. 
*/ if (ARC_BUF_ENCRYPTED(buf) && HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && !HDR_IO_IN_PROGRESS(hdr)) { arc_buf_t *b; for (b = hdr->b_l1hdr.b_buf; b; b = b->b_next) { if (b != buf && ARC_BUF_ENCRYPTED(b)) break; } if (b == NULL) arc_hdr_free_abd(hdr, B_TRUE); } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { /* * If the current arc_buf_t is sharing its data buffer with the * hdr, then reassign the hdr's b_pabd to share it with the new * buffer at the end of the list. The shared buffer is always * the last one on the hdr's buffer list. * * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be * wasted temporarily. We must also be careful not to share * encrypted buffers, since they cannot be shared. */ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ ASSERT(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ ASSERT(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); arc_hdr_free_abd(hdr, B_FALSE); /* * We must setup a new shared block between the * last buffer and the hdr. The data would have * been allocated by the arc buf so we need to transfer * ownership to the hdr since it's now being shared. */ arc_share_buf(hdr, lastbuf); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* * Free the checksum if we're removing the last uncompressed buf from * this hdr. */ if (!arc_hdr_has_uncompressed_buf(hdr)) { arc_cksum_free(hdr); } /* clean up the buf */ buf->b_hdr = NULL; kmem_cache_free(buf_cache, buf); } static void arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags) { uint64_t size; boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); if (alloc_rdata) { size = HDR_GET_PSIZE(hdr); ASSERT0P(hdr->b_crypt_hdr.b_rabd); hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); ARCSTAT_INCR(arcstat_raw_size, size); } else { size = arc_hdr_size(hdr); ASSERT0P(hdr->b_l1hdr.b_pabd); hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr, alloc_flags); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); } ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); IMPLY(free_rdata, HDR_HAS_RABD(hdr)); /* * If the hdr is currently being written to the l2arc then * we defer freeing the data by adding it to the l2arc_free_on_write * list. The l2arc will free the data once it's finished * writing it to the l2arc device. 
*/ if (HDR_L2_WRITING(hdr)) { arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else if (free_rdata) { arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); } if (free_rdata) { hdr->b_crypt_hdr.b_rabd = NULL; ARCSTAT_INCR(arcstat_raw_size, -size); } else { hdr->b_l1hdr.b_pabd = NULL; } if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } /* * Allocate empty anonymous ARC header. The header will get its identity * assigned and buffers attached later as part of read or write operations. * * In case of read arc_read() assigns header its identify (b_dva + b_birth), * inserts it into ARC hash to become globally visible and allocates physical * (b_pabd) or raw (b_rabd) ABD buffer to read into from disk. On disk read * completion arc_read_done() allocates ARC buffer(s) as needed, potentially * sharing one of them with the physical ABD buffer. * * In case of write arc_alloc_buf() allocates ARC buffer to be filled with * data. Then after compression and/or encryption arc_write_ready() allocates * and fills (or potentially shares) physical (b_pabd) or raw (b_rabd) ABD * buffer. On disk write completion arc_write_done() assigns the header its * new identity (b_dva + b_birth) and inserts into ARC hash. * * In case of partial overwrite the old data is read first as described. Then * arc_release() either allocates new anonymous ARC header and moves the ARC * buffer to it, or reuses the old ARC header by discarding its identity and * removing it from ARC hash. After buffer modification normal write process * follows as described. */ static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, boolean_t protected, enum zio_compress compression_type, uint8_t complevel, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(HDR_EMPTY(hdr)); #ifdef ZFS_DEBUG ASSERT0P(hdr->b_l1hdr.b_freeze_cksum); #endif HDR_SET_PSIZE(hdr, psize); HDR_SET_LSIZE(hdr, lsize); hdr->b_spa = spa; hdr->b_type = type; hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); hdr->b_complevel = complevel; if (protected) arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; hdr->b_l1hdr.b_buf = NULL; ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); } /* * Transition between the two allocation states for the arc_buf_hdr struct. * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller * version is used when a cache buffer is only in the L2ARC in order to reduce * memory usage. 
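 *
 * As a point of reference (a sketch; the exact definitions live with the
 * other header macros, not here): the l2only variant simply omits the
 * trailing L1-only fields, roughly
 *
 *   HDR_FULL_SIZE   == sizeof (arc_buf_hdr_t)
 *   HDR_L2ONLY_SIZE == offsetof(arc_buf_hdr_t, b_l1hdr)
 *
 * so demoting a header to L2-only saves the difference, which is what
 * the *real_evicted accounting in arc_evict_hdr() reports.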
*/ static arc_buf_hdr_t * arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) { ASSERT(HDR_HAS_L2HDR(hdr)); arc_buf_hdr_t *nhdr; l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); buf_hash_remove(hdr); memcpy(nhdr, hdr, HDR_L2ONLY_SIZE); if (new == hdr_full_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a * header has just come out of L2ARC, so we set its state to * l2c_only even though it's about to change. */ nhdr->b_l1hdr.b_state = arc_l2c_only; /* Verify previous threads set to NULL before freeing */ ASSERT0P(nhdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT0P(hdr->b_l1hdr.b_buf); #ifdef ZFS_DEBUG ASSERT0P(hdr->b_l1hdr.b_freeze_cksum); #endif /* * If we've reached here, We must have been called from * arc_evict_hdr(), as such we should have already been * removed from any ghost list we were previously on * (which protects us from racing with arc_evict_state), * thus no locking is needed during this check. */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); /* * A buffer must not be moved into the arc_l2c_only * state if it's not finished being written out to the * l2arc device. Otherwise, the b_l1hdr.b_pabd field * might try to be accessed, even though it was removed. */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); } /* * The header has been reallocated so we need to re-insert it into any * lists it was on. */ (void) buf_hash_insert(nhdr, NULL); ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); mutex_enter(&dev->l2ad_mtx); /* * We must place the realloc'ed header back into the list at * the same spot. Otherwise, if it's placed earlier in the list, * l2arc_write_buffers() could find it during the function's * write phase, and try to write it out to the l2arc. */ list_insert_after(&dev->l2ad_buflist, hdr, nhdr); list_remove(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); /* * Since we're using the pointer address as the tag when * incrementing and decrementing the l2ad_alloc refcount, we * must remove the old pointer (that we're about to destroy) and * add the new pointer to the refcount. Otherwise we'd remove * the wrong pointer address when calling arc_hdr_destroy() later. */ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr); buf_discard_identity(hdr); kmem_cache_free(old, hdr); return (nhdr); } /* * This function is used by the send / receive code to convert a newly * allocated arc_buf_t to one that is suitable for a raw encrypted write. It * is also used to allow the root objset block to be updated without altering * its embedded MACs. Both block types will always be uncompressed so we do not * have to worry about compression type or psize. 
*/ void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac) { arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); if (salt != NULL) memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); if (iv != NULL) memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); if (mac != NULL) memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); } /* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ arc_buf_t * arc_alloc_buf(spa_t *spa, const void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, B_FALSE, ZIO_COMPRESS_OFF, 0, type); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } /* * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this * for bufs containing metadata. */ arc_buf_t * arc_alloc_compressed_buf(spa_t *spa, const void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_FALSE, compression_type, complevel, ARC_BUFC_DATA); arc_buf_t *buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); /* * To ensure that the hdr has the correct data in it if we call * arc_untransform() on this buf before it's been written to disk, * it's easiest if we just set up sharing between the buf and the hdr. */ arc_share_buf(hdr, buf); return (buf); } arc_buf_t * arc_alloc_raw_buf(spa_t *spa, const void *tag, uint64_t dsobj, boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, enum zio_compress compression_type, uint8_t complevel) { arc_buf_hdr_t *hdr; arc_buf_t *buf; arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? ARC_BUFC_METADATA : ARC_BUFC_DATA; ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, compression_type, complevel, type); hdr->b_crypt_hdr.b_dsobj = dsobj; hdr->b_crypt_hdr.b_ot = ot; hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); memcpy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); memcpy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); memcpy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); /* * This buffer will be considered encrypted even if the ot is not an * encrypted type. It will become authenticated instead in * arc_write_ready(). 
*/ buf = NULL; VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); } static void l2arc_hdr_arcstats_update(arc_buf_hdr_t *hdr, boolean_t incr, boolean_t state_only) { uint64_t lsize = HDR_GET_LSIZE(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = HDR_GET_L2SIZE(hdr); arc_buf_contents_t type = hdr->b_type; int64_t lsize_s; int64_t psize_s; int64_t asize_s; /* For L2 we expect the header's b_l2size to be valid */ ASSERT3U(asize, >=, psize); if (incr) { lsize_s = lsize; psize_s = psize; asize_s = asize; } else { lsize_s = -lsize; psize_s = -psize; asize_s = -asize; } /* If the buffer is a prefetch, count it as such. */ if (HDR_PREFETCH(hdr)) { ARCSTAT_INCR(arcstat_l2_prefetch_asize, asize_s); } else { /* * We use the value stored in the L2 header upon initial * caching in L2ARC. This value will be updated in case * an MRU/MRU_ghost buffer transitions to MFU but the L2ARC * metadata (log entry) cannot currently be updated. Having * the ARC state in the L2 header solves the problem of a * possibly absent L1 header (apparent in buffers restored * from persistent L2ARC). */ switch (hdr->b_l2hdr.b_arcs_state) { case ARC_STATE_MRU_GHOST: case ARC_STATE_MRU: ARCSTAT_INCR(arcstat_l2_mru_asize, asize_s); break; case ARC_STATE_MFU_GHOST: case ARC_STATE_MFU: ARCSTAT_INCR(arcstat_l2_mfu_asize, asize_s); break; default: break; } } if (state_only) return; ARCSTAT_INCR(arcstat_l2_psize, psize_s); ARCSTAT_INCR(arcstat_l2_lsize, lsize_s); switch (type) { case ARC_BUFC_DATA: ARCSTAT_INCR(arcstat_l2_bufc_data_asize, asize_s); break; case ARC_BUFC_METADATA: ARCSTAT_INCR(arcstat_l2_bufc_metadata_asize, asize_s); break; default: break; } } static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; l2arc_dev_t *dev = l2hdr->b_dev; ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); ASSERT(HDR_HAS_L2HDR(hdr)); list_remove(&dev->l2ad_buflist, hdr); l2arc_hdr_arcstats_decrement(hdr); if (dev->l2ad_vdev != NULL) { uint64_t asize = HDR_GET_L2SIZE(hdr); vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); } (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); } static void arc_hdr_destroy(arc_buf_hdr_t *hdr) { if (HDR_HAS_L1HDR(hdr)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); } ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); if (HDR_HAS_L2HDR(hdr)) { l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); if (!buflist_held) mutex_enter(&dev->l2ad_mtx); /* * Even though we checked this conditional above, we * need to check this again now that we have the * l2ad_mtx. This is because we could be racing with * another thread calling l2arc_evict() which might have * destroyed this header's L2 portion as we were waiting * to acquire the l2ad_mtx. If that happens, we don't * want to re-destroy the header's L2 portion. */ if (HDR_HAS_L2HDR(hdr)) { if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); arc_hdr_l2hdr_destroy(hdr); } if (!buflist_held) mutex_exit(&dev->l2ad_mtx); } /* * The header's identify can only be safely discarded once it is no * longer discoverable. This requires removing it from the hash table * and the l2arc header list. After this point the hash lock can not * be used to protect the header. 
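 *
 * The remaining teardown then proceeds roughly as follows (a sketch of
 * the code below):
 *
 *   buf_discard_identity(hdr)             - drop the dva/birth identity
 *   arc_buf_destroy_impl(each b_buf)      - free or unshare each b_data
 *   arc_hdr_free_abd(hdr, ...)            - free b_pabd and, if any, b_rabd
 *   kmem_cache_free(hdr_full_cache, hdr)  - or hdr_l2only_cache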
*/ if (!HDR_EMPTY(hdr)) buf_discard_identity(hdr); if (HDR_HAS_L1HDR(hdr)) { arc_cksum_free(hdr); while (hdr->b_l1hdr.b_buf != NULL) arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT0P(hdr->b_hash_next); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT0P(hdr->b_l1hdr.b_acb); #ifdef ZFS_DEBUG ASSERT0P(hdr->b_l1hdr.b_freeze_cksum); #endif kmem_cache_free(hdr_full_cache, hdr); } else { kmem_cache_free(hdr_l2only_cache, hdr); } } void arc_buf_destroy(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); VERIFY0(remove_reference(hdr, tag)); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hdr, ==, buf->b_hdr); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); ASSERT3P(buf->b_data, !=, NULL); arc_buf_destroy_impl(buf); (void) remove_reference(hdr, tag); mutex_exit(hash_lock); } /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost * - arc_mfu -> arc_mfu_ghost * - arc_mru_ghost -> arc_l2c_only * - arc_mru_ghost -> deleted * - arc_mfu_ghost -> arc_l2c_only * - arc_mfu_ghost -> deleted * - arc_uncached -> deleted * * Return total size of evicted data buffers for eviction progress tracking. * When evicting from ghost states return logical buffer size to make eviction * progress at the same (or at least comparable) rate as from non-ghost states. * * Return *real_evicted for actual ARC size reduction to wake up threads * waiting for it. For non-ghost states it includes size of evicted data * buffers (the headers are not freed there). For ghost states it includes * only the evicted headers size. */ static int64_t arc_evict_hdr(arc_buf_hdr_t *hdr, uint64_t *real_evicted) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; uint_t min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? arc_min_prescient_prefetch_ms : arc_min_prefetch_ms; ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0P(hdr->b_l1hdr.b_buf); ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); *real_evicted = 0; state = hdr->b_l1hdr.b_state; if (GHOST_STATE(state)) { /* * l2arc_write_buffers() relies on a header's L1 portion * (i.e. its b_pabd field) during it's write phase. * Thus, we cannot push a header onto the arc_l2c_only * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { ARCSTAT_BUMP(arcstat_evict_l2_skip); return (bytes_evicted); } ARCSTAT_BUMP(arcstat_deleted); bytes_evicted += HDR_GET_LSIZE(hdr); DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); if (HDR_HAS_L2HDR(hdr)) { ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. */ arc_change_state(arc_l2c_only, hdr); /* * dropping from L1+L2 cached to L2-only, * realloc to remove the L1 header. 
*/ (void) arc_hdr_realloc(hdr, hdr_full_cache, hdr_l2only_cache); *real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE; } else { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } return (bytes_evicted); } ASSERT(state == arc_mru || state == arc_mfu || state == arc_uncached); evicted_state = (state == arc_uncached) ? arc_anon : ((state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost); /* prefetch buffers have a minimum lifespan */ if ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < MSEC_TO_TICK(min_lifetime)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } if (HDR_HAS_L2HDR(hdr)) { ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); } else { if (l2arc_write_eligible(hdr->b_spa, hdr)) { ARCSTAT_INCR(arcstat_evict_l2_eligible, HDR_GET_LSIZE(hdr)); switch (state->arcs_state) { case ARC_STATE_MRU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mru, HDR_GET_LSIZE(hdr)); break; case ARC_STATE_MFU: ARCSTAT_INCR( arcstat_evict_l2_eligible_mfu, HDR_GET_LSIZE(hdr)); break; default: break; } } else { ARCSTAT_INCR(arcstat_evict_l2_ineligible, HDR_GET_LSIZE(hdr)); } } bytes_evicted += arc_hdr_size(hdr); *real_evicted += arc_hdr_size(hdr); /* * If this hdr is being evicted and has a compressed buffer then we * discard it here before we change states. This ensures that the * accounting is updated correctly in arc_free_data_impl(). */ if (hdr->b_l1hdr.b_pabd != NULL) arc_hdr_free_abd(hdr, B_FALSE); if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); arc_change_state(evicted_state, hdr); DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); if (evicted_state == arc_anon) { arc_hdr_destroy(hdr); *real_evicted += HDR_FULL_SIZE; } else { ASSERT(HDR_IN_HASH_TABLE(hdr)); } return (bytes_evicted); } static void arc_set_need_free(void) { ASSERT(MUTEX_HELD(&arc_evict_lock)); int64_t remaining = arc_free_memory() - arc_sys_free / 2; arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); if (aw == NULL) { arc_need_free = MAX(-remaining, 0); } else { arc_need_free = MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); } } static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, uint64_t bytes) { multilist_sublist_t *mls; uint64_t bytes_evicted = 0, real_evicted = 0; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint_t evict_count = zfs_arc_evict_batch_limit; ASSERT3P(marker, !=, NULL); mls = multilist_sublist_lock_idx(ml, idx); for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL); hdr = multilist_sublist_prev(mls, marker)) { if ((evict_count == 0) || (bytes_evicted >= bytes)) break; /* * To keep our iteration location, move the marker * forward. Since we're not holding hdr's hash lock, we * must be very careful and not remove 'hdr' from the * sublist. Otherwise, other consumers might mistake the * 'hdr' as not being on a sublist when they call the * multilist_link_active() function (they all rely on * the hash lock protecting concurrent insertions and * removals). multilist_sublist_move_forward() was * specifically implemented to ensure this is the case * (only 'marker' will be removed and re-inserted). */ multilist_sublist_move_forward(mls, marker); /* * The only case where the b_spa field should ever be * zero, is the marker headers inserted by * arc_evict_state(). It's possible for multiple threads * to be calling arc_evict_state() concurrently (e.g. * dsl_pool_close() and zio_inject_fault()), so we must * skip any markers we see from these other threads. 
*/ if (hdr->b_spa == 0) continue; /* we're only interested in evicting buffers of a certain spa */ if (spa != 0 && hdr->b_spa != spa) { ARCSTAT_BUMP(arcstat_evict_skip); continue; } hash_lock = HDR_LOCK(hdr); /* * We aren't calling this function from any code path * that would already be holding a hash lock, so we're * asserting on this assumption to be defensive in case * this ever changes. Without this check, it would be * possible to incorrectly increment arcstat_mutex_miss * below (e.g. if the code changed such that we called * this function with a hash lock held). */ ASSERT(!MUTEX_HELD(hash_lock)); if (mutex_tryenter(hash_lock)) { uint64_t revicted; uint64_t evicted = arc_evict_hdr(hdr, &revicted); mutex_exit(hash_lock); bytes_evicted += evicted; real_evicted += revicted; /* * If evicted is zero, arc_evict_hdr() must have * decided to skip this header, don't increment * evict_count in this case. */ if (evicted != 0) evict_count--; } else { ARCSTAT_BUMP(arcstat_mutex_miss); } } multilist_sublist_unlock(mls); /* * Increment the count of evicted bytes, and wake up any threads that * are waiting for the count to reach this value. Since the list is * ordered by ascending aew_count, we pop off the beginning of the * list until we reach the end, or a waiter that's past the current * "count". Doing this outside the loop reduces the number of times * we need to acquire the global arc_evict_lock. * * Only wake when there's sufficient free memory in the system * (specifically, arc_sys_free/2, which by default is a bit more than * 1/64th of RAM). See the comments in arc_wait_for_eviction(). */ mutex_enter(&arc_evict_lock); arc_evict_count += real_evicted; if (arc_free_memory() > arc_sys_free / 2) { arc_evict_waiter_t *aw; while ((aw = list_head(&arc_evict_waiters)) != NULL && aw->aew_count <= arc_evict_count) { list_remove(&arc_evict_waiters, aw); cv_broadcast(&aw->aew_cv); } } arc_set_need_free(); mutex_exit(&arc_evict_lock); /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for * many seconds. To ensure that other threads that may be bound to * this CPU are able to make progress, make a voluntary preemption * call here. */ kpreempt(KPREEMPT_SYNC); return (bytes_evicted); } static arc_buf_hdr_t * arc_state_alloc_marker(void) { arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); /* * A b_spa of 0 is used to indicate that this header is * a marker. This fact is used in arc_evict_state_impl(). */ marker->b_spa = 0; return (marker); } static void arc_state_free_marker(arc_buf_hdr_t *marker) { kmem_cache_free(hdr_full_cache, marker); } /* * Allocate an array of buffer headers used as placeholders during arc state * eviction. 
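 *
 * Typical usage, as in arc_evict_state() below (sketch):
 *
 *   markers = arc_state_alloc_markers(num_sublists);
 *   insert markers[i] at the tail of each sublist;
 *   evict, moving each marker forward to remember where a pass stopped;
 *   remove markers[i] from each sublist;
 *   arc_state_free_markers(markers, num_sublists);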
*/ static arc_buf_hdr_t ** arc_state_alloc_markers(int count) { arc_buf_hdr_t **markers; markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP); for (int i = 0; i < count; i++) markers[i] = arc_state_alloc_marker(); return (markers); } static void arc_state_free_markers(arc_buf_hdr_t **markers, int count) { for (int i = 0; i < count; i++) arc_state_free_marker(markers[i]); kmem_free(markers, sizeof (*markers) * count); } typedef struct evict_arg { taskq_ent_t eva_tqent; multilist_t *eva_ml; arc_buf_hdr_t *eva_marker; int eva_idx; uint64_t eva_spa; uint64_t eva_bytes; uint64_t eva_evicted; } evict_arg_t; static void arc_evict_task(void *arg) { evict_arg_t *eva = arg; eva->eva_evicted = arc_evict_state_impl(eva->eva_ml, eva->eva_idx, eva->eva_marker, eva->eva_spa, eva->eva_bytes); } static void arc_evict_thread_init(void) { if (zfs_arc_evict_threads == 0) { /* * Compute number of threads we want to use for eviction. * * Normally, it's log2(ncpus) + ncpus/32, which gets us to the * default max of 16 threads at ~256 CPUs. * * However, that formula goes to two threads at 4 CPUs, which * is still rather to low to be really useful, so we just go * with 1 thread at fewer than 6 cores. */ if (max_ncpus < 6) zfs_arc_evict_threads = 1; else zfs_arc_evict_threads = (highbit64(max_ncpus) - 1) + max_ncpus / 32; } else if (zfs_arc_evict_threads > max_ncpus) zfs_arc_evict_threads = max_ncpus; if (zfs_arc_evict_threads > 1) { arc_evict_taskq = taskq_create("arc_evict", zfs_arc_evict_threads, defclsyspri, 0, INT_MAX, TASKQ_PREPOPULATE); arc_evict_arg = kmem_zalloc( sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_SLEEP); } } /* * The minimum number of bytes we can evict at once is a block size. * So, SPA_MAXBLOCKSIZE is a reasonable minimal value per an eviction task. * We use this value to compute a scaling factor for the eviction tasks. */ #define MIN_EVICT_SIZE (SPA_MAXBLOCKSIZE) /* * Evict buffers from the given arc state, until we've removed the * specified number of bytes. Move the removed buffers to the * appropriate evict state. * * This function makes a "best effort". It skips over any buffers * it can't get a hash_lock on, and so, may not catch all candidates. * It may also return without evicting as much space as requested. * * If bytes is specified using the special value ARC_EVICT_ALL, this * will evict all available (i.e. unlocked and evictable) buffers from * the given arc state; which is used by arc_flush(). */ static uint64_t arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa, uint64_t bytes) { uint64_t total_evicted = 0; multilist_t *ml = &state->arcs_list[type]; int num_sublists; arc_buf_hdr_t **markers; evict_arg_t *eva = NULL; num_sublists = multilist_get_num_sublists(ml); boolean_t use_evcttq = zfs_arc_evict_threads > 1; /* * If we've tried to evict from each sublist, made some * progress, but still have not hit the target number of bytes * to evict, we want to keep trying. The markers allow us to * pick up where we left off for each individual sublist, rather * than starting from the tail each time. 
*/ if (zthr_iscurthread(arc_evict_zthr)) { markers = arc_state_evict_markers; ASSERT3S(num_sublists, <=, arc_state_evict_marker_count); } else { markers = arc_state_alloc_markers(num_sublists); } for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls; mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_insert_tail(mls, markers[i]); multilist_sublist_unlock(mls); } if (use_evcttq) { if (zthr_iscurthread(arc_evict_zthr)) eva = arc_evict_arg; else eva = kmem_alloc(sizeof (evict_arg_t) * zfs_arc_evict_threads, KM_NOSLEEP); if (eva) { for (int i = 0; i < zfs_arc_evict_threads; i++) { taskq_init_ent(&eva[i].eva_tqent); eva[i].eva_ml = ml; eva[i].eva_spa = spa; } } else { /* * Fall back to the regular single evict if it is not * possible to allocate memory for the taskq entries. */ use_evcttq = B_FALSE; } } /* * Start eviction using a randomly selected sublist, this is to try and * evenly balance eviction across all sublists. Always starting at the * same sublist (e.g. index 0) would cause evictions to favor certain * sublists over others. */ uint64_t scan_evicted = 0; int sublists_left = num_sublists; int sublist_idx = multilist_get_random_index(ml); /* * While we haven't hit our target number of bytes to evict, or * we're evicting all available buffers. */ while (total_evicted < bytes) { uint64_t evict = MIN_EVICT_SIZE; uint_t ntasks = zfs_arc_evict_threads; if (use_evcttq) { if (sublists_left < ntasks) ntasks = sublists_left; if (ntasks < 2) use_evcttq = B_FALSE; } if (use_evcttq) { uint64_t left = bytes - total_evicted; if (bytes == ARC_EVICT_ALL) { evict = bytes; } else if (left > ntasks * MIN_EVICT_SIZE) { evict = DIV_ROUND_UP(left, ntasks); } else { ntasks = DIV_ROUND_UP(left, MIN_EVICT_SIZE); if (ntasks == 1) use_evcttq = B_FALSE; } } for (int i = 0; sublists_left > 0; i++, sublist_idx++, sublists_left--) { uint64_t bytes_remaining; uint64_t bytes_evicted; /* we've reached the end, wrap to the beginning */ if (sublist_idx >= num_sublists) sublist_idx = 0; if (use_evcttq) { if (i == ntasks) break; eva[i].eva_marker = markers[sublist_idx]; eva[i].eva_idx = sublist_idx; eva[i].eva_bytes = evict; taskq_dispatch_ent(arc_evict_taskq, arc_evict_task, &eva[i], 0, &eva[i].eva_tqent); continue; } if (total_evicted < bytes) bytes_remaining = bytes - total_evicted; else break; bytes_evicted = arc_evict_state_impl(ml, sublist_idx, markers[sublist_idx], spa, bytes_remaining); scan_evicted += bytes_evicted; total_evicted += bytes_evicted; } if (use_evcttq) { taskq_wait(arc_evict_taskq); for (int i = 0; i < ntasks; i++) { scan_evicted += eva[i].eva_evicted; total_evicted += eva[i].eva_evicted; } } /* * If we scanned all sublists and didn't evict anything, we * have no reason to believe we'll evict more during another * scan, so break the loop. */ if (scan_evicted == 0 && sublists_left == 0) { /* This isn't possible, let's make that obvious */ ASSERT3S(bytes, !=, 0); /* * When bytes is ARC_EVICT_ALL, the only way to * break the loop is when scan_evicted is zero. * In that case, we actually have evicted enough, * so we don't want to increment the kstat. */ if (bytes != ARC_EVICT_ALL) { ASSERT3S(total_evicted, <, bytes); ARCSTAT_BUMP(arcstat_evict_not_enough); } break; } /* * If we scanned all sublists but still have more to do, * reset the counts so we can go around again. 
*/ if (sublists_left == 0) { sublists_left = num_sublists; sublist_idx = multilist_get_random_index(ml); scan_evicted = 0; /* * Since we're about to reconsider all sublists, * re-enable use of the evict threads if available. */ use_evcttq = (zfs_arc_evict_threads > 1 && eva != NULL); } } if (eva != NULL && eva != arc_evict_arg) kmem_free(eva, sizeof (evict_arg_t) * zfs_arc_evict_threads); for (int i = 0; i < num_sublists; i++) { multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i); multilist_sublist_remove(mls, markers[i]); multilist_sublist_unlock(mls); } if (markers != arc_state_evict_markers) arc_state_free_markers(markers, num_sublists); return (total_evicted); } /* * Flush all "evictable" data of the given type from the arc state * specified. This will not evict any "active" buffers (i.e. referenced). * * When 'retry' is set to B_FALSE, the function will make a single pass * over the state and evict any buffers that it can. Since it doesn't * continually retry the eviction, it might end up leaving some buffers * in the ARC due to lock misses. * * When 'retry' is set to B_TRUE, the function will continually retry the * eviction until *all* evictable buffers have been removed from the * state. As a result, if concurrent insertions into the state are * allowed (e.g. if the ARC isn't shutting down), this function might * wind up in an infinite loop, continually trying to evict buffers. */ static uint64_t arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, boolean_t retry) { uint64_t evicted = 0; while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { evicted += arc_evict_state(state, type, spa, ARC_EVICT_ALL); if (!retry) break; } return (evicted); } /* * Evict the specified number of bytes from the state specified. This * function prevents us from trying to evict more from a state's list * than is "evictable", and to skip evicting altogether when passed a * negative value for "bytes". In contrast, arc_evict_state() will * evict everything it can, when passed a negative value for "bytes". */ static uint64_t arc_evict_impl(arc_state_t *state, arc_buf_contents_t type, int64_t bytes) { uint64_t delta; if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), bytes); return (arc_evict_state(state, type, 0, delta)); } return (0); } /* * Adjust specified fraction, taking into account initial ghost state(s) size, * ghost hit bytes towards increasing the fraction, ghost hit bytes towards * decreasing it, plus a balance factor, controlling the decrease rate, used * to balance metadata vs data. */ static uint64_t arc_evict_adj(uint64_t frac, uint64_t total, uint64_t up, uint64_t down, uint_t balance) { if (total < 32 || up + down == 0) return (frac); /* * We should not have more ghost hits than ghost size, but they may * get close. To avoid overflows below up/down should not be bigger * than 1/5 of total. But to limit maximum adjustment speed restrict * it some more. */ if (up + down >= total / 16) { uint64_t scale = (up + down) / (total / 32); up /= scale; down /= scale; } /* Get maximal dynamic range by choosing optimal shifts. 
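 *
 * A worked example with assumed numbers: for total = 2^40,
 * highbit64(total) is 41, so s = MIN(64 - 41, 32) = 23 and the update
 * below becomes
 *
 *   up = (up << 23) / (total >> 9)
 *
 * which equals up * 2^32 / total, while both intermediates stay well
 * within 64 bits thanks to the scaling applied above.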
*/ int s = highbit64(total); s = MIN(64 - s, 32); ASSERT3U(frac, <=, 1ULL << 32); uint64_t ofrac = (1ULL << 32) - frac; if (frac >= 4 * ofrac) up /= frac / (2 * ofrac + 1); up = (up << s) / (total >> (32 - s)); if (ofrac >= 4 * frac) down /= ofrac / (2 * frac + 1); down = (down << s) / (total >> (32 - s)); down = down * 100 / balance; ASSERT3U(up, <=, (1ULL << 32) - frac); ASSERT3U(down, <=, frac); return (frac + up - down); } /* * Calculate (x * multiplier / divisor) without unnecesary overflows. */ static uint64_t arc_mf(uint64_t x, uint64_t multiplier, uint64_t divisor) { uint64_t q = (x / divisor); uint64_t r = (x % divisor); return ((q * multiplier) + ((r * multiplier) / divisor)); } /* * Evict buffers from the cache, such that arcstat_size is capped by arc_c. */ static uint64_t arc_evict(void) { uint64_t bytes, total_evicted = 0; int64_t e, mrud, mrum, mfud, mfum, w; static uint64_t ogrd, ogrm, ogfd, ogfm; static uint64_t gsrd, gsrm, gsfd, gsfm; uint64_t ngrd, ngrm, ngfd, ngfm; /* Get current size of ARC states we can evict from. */ mrud = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]); mrum = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]); mfud = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_DATA]); mfum = zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); uint64_t d = mrud + mfud; uint64_t m = mrum + mfum; uint64_t t = d + m; /* Get ARC ghost hits since last eviction. */ ngrd = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t grd = ngrd - ogrd; ogrd = ngrd; ngrm = wmsum_value(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t grm = ngrm - ogrm; ogrm = ngrm; ngfd = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); uint64_t gfd = ngfd - ogfd; ogfd = ngfd; ngfm = wmsum_value(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); uint64_t gfm = ngfm - ogfm; ogfm = ngfm; /* Adjust ARC states balance based on ghost hits. */ arc_meta = arc_evict_adj(arc_meta, gsrd + gsrm + gsfd + gsfm, grm + gfm, grd + gfd, zfs_arc_meta_balance); arc_pd = arc_evict_adj(arc_pd, gsrd + gsfd, grd, gfd, 100); arc_pm = arc_evict_adj(arc_pm, gsrm + gsfm, grm, gfm, 100); uint64_t asize = aggsum_value(&arc_sums.arcstat_size); uint64_t ac = arc_c; int64_t wt = t - (asize - ac); /* * Try to reduce pinned dnodes if more than 3/4 of wanted metadata * target is not evictable or if they go over arc_dnode_limit. */ int64_t prune = 0; int64_t dn = aggsum_value(&arc_sums.arcstat_dnode_size); int64_t nem = zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); w = wt * (int64_t)(arc_meta >> 16) >> 16; if (nem > w * 3 / 4) { prune = dn / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100; if (nem < w && w > 4) prune = arc_mf(prune, nem - w * 3 / 4, w / 4); } if (dn > arc_dnode_limit) { prune = MAX(prune, (dn - arc_dnode_limit) / sizeof (dnode_t) * zfs_arc_dnode_reduce_percent / 100); } if (prune > 0) arc_prune_async(prune); /* Evict MRU metadata. */ w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16; e = MIN((int64_t)(asize - ac), (int64_t)(mrum - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); total_evicted += bytes; mrum -= bytes; asize -= bytes; /* Evict MFU metadata. 
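 *
 * (A sketch of the fixed-point arithmetic used for these targets:
 * arc_meta is a fraction scaled by 2^32, so the weight computed below,
 *
 *   w = wt * (arc_meta >> 16) >> 16,
 *
 * is approximately wt * arc_meta / 2^32, i.e. the share of the wanted
 * ARC size that metadata as a whole may occupy.)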
*/ w = wt * (int64_t)(arc_meta >> 16) >> 16; e = MIN((int64_t)(asize - ac), (int64_t)(m - bytes - w)); bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); total_evicted += bytes; mfum -= bytes; asize -= bytes; /* Evict MRU data. */ wt -= m - total_evicted; w = wt * (int64_t)(arc_pd >> 16) >> 16; e = MIN((int64_t)(asize - ac), (int64_t)(mrud - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); total_evicted += bytes; mrud -= bytes; asize -= bytes; /* Evict MFU data. */ e = asize - ac; bytes = arc_evict_impl(arc_mfu, ARC_BUFC_DATA, e); mfud -= bytes; total_evicted += bytes; /* * Evict ghost lists * * Size of each state's ghost list represents how much that state * may grow by shrinking the other states. Would it need to shrink * other states to zero (that is unlikely), its ghost size would be * equal to sum of other three state sizes. But excessive ghost * size may result in false ghost hits (too far back), that may * never result in real cache hits if several states are competing. * So choose some arbitraty point of 1/2 of other state sizes. */ gsrd = (mrum + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]) - gsrd; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_DATA, e); gsrm = (mrud + mfud + mfum) / 2; e = zfs_refcount_count(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]) - gsrm; (void) arc_evict_impl(arc_mru_ghost, ARC_BUFC_METADATA, e); gsfd = (mrud + mrum + mfum) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]) - gsfd; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_DATA, e); gsfm = (mrud + mrum + mfud) / 2; e = zfs_refcount_count(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]) - gsfm; (void) arc_evict_impl(arc_mfu_ghost, ARC_BUFC_METADATA, e); return (total_evicted); } static void arc_flush_impl(uint64_t guid, boolean_t retry) { ASSERT(!retry || guid == 0); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_DATA, retry); (void) arc_flush_state(arc_uncached, guid, ARC_BUFC_METADATA, retry); } void arc_flush(spa_t *spa, boolean_t retry) { /* * If retry is B_TRUE, a spa must not be specified since we have * no good way to determine if all of a spa's buffers have been * evicted from an arc state. */ ASSERT(!retry || spa == NULL); arc_flush_impl(spa != NULL ? 
spa_load_guid(spa) : 0, retry); } static arc_async_flush_t * arc_async_flush_add(uint64_t spa_guid, uint_t level) { arc_async_flush_t *af = kmem_alloc(sizeof (*af), KM_SLEEP); af->af_spa_guid = spa_guid; af->af_cache_level = level; taskq_init_ent(&af->af_tqent); list_link_init(&af->af_node); mutex_enter(&arc_async_flush_lock); list_insert_tail(&arc_async_flush_list, af); mutex_exit(&arc_async_flush_lock); return (af); } static void arc_async_flush_remove(uint64_t spa_guid, uint_t level) { mutex_enter(&arc_async_flush_lock); for (arc_async_flush_t *af = list_head(&arc_async_flush_list); af != NULL; af = list_next(&arc_async_flush_list, af)) { if (af->af_spa_guid == spa_guid && af->af_cache_level == level) { list_remove(&arc_async_flush_list, af); kmem_free(af, sizeof (*af)); break; } } mutex_exit(&arc_async_flush_lock); } static void arc_flush_task(void *arg) { arc_async_flush_t *af = arg; hrtime_t start_time = gethrtime(); uint64_t spa_guid = af->af_spa_guid; arc_flush_impl(spa_guid, B_FALSE); arc_async_flush_remove(spa_guid, af->af_cache_level); uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); if (elaspsed > 0) { zfs_dbgmsg("spa %llu arc flushed in %llu ms", (u_longlong_t)spa_guid, (u_longlong_t)elaspsed); } } /* * ARC buffers use the spa's load guid and can continue to exist after * the spa_t is gone (exported). The blocks are orphaned since each * spa import has a different load guid. * * It's OK if the spa is re-imported while this asynchronous flush is * still in progress. The new spa_load_guid will be different. * * Also, arc_fini will wait for any arc_flush_task to finish. */ void arc_flush_async(spa_t *spa) { uint64_t spa_guid = spa_load_guid(spa); arc_async_flush_t *af = arc_async_flush_add(spa_guid, 1); taskq_dispatch_ent(arc_flush_taskq, arc_flush_task, af, TQ_SLEEP, &af->af_tqent); } /* * Check if a guid is still in-use as part of an async teardown task */ boolean_t arc_async_flush_guid_inuse(uint64_t spa_guid) { mutex_enter(&arc_async_flush_lock); for (arc_async_flush_t *af = list_head(&arc_async_flush_list); af != NULL; af = list_next(&arc_async_flush_list, af)) { if (af->af_spa_guid == spa_guid) { mutex_exit(&arc_async_flush_lock); return (B_TRUE); } } mutex_exit(&arc_async_flush_lock); return (B_FALSE); } uint64_t arc_reduce_target_size(uint64_t to_free) { /* * Get the actual arc size. Even if we don't need it, this updates * the aggsum lower bound estimate for arc_is_overflowing(). */ uint64_t asize = aggsum_value(&arc_sums.arcstat_size); /* * All callers want the ARC to actually evict (at least) this much * memory. Therefore we reduce from the lower of the current size and * the target size. This way, even if arc_c is much higher than * arc_size (as can be the case after many calls to arc_freed(), we will * immediately have arc_c < arc_size and therefore the arc_evict_zthr * will evict. */ uint64_t c = arc_c; if (c > arc_c_min) { c = MIN(c, MAX(asize, arc_c_min)); to_free = MIN(to_free, c - arc_c_min); arc_c = c - to_free; } else { to_free = 0; } /* * Since dbuf cache size is a fraction of target ARC size, we should * notify dbuf about the reduction, which might be significant, * especially if current ARC size was much smaller than the target. */ dbuf_cache_reduce_target_size(); /* * Whether or not we reduced the target size, request eviction if the * current size is over it now, since caller obviously wants some RAM. 
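 *
 * A worked example with made-up numbers: if arc_c = 8 GiB,
 * arc_c_min = 1 GiB, asize = 3 GiB and to_free = 4 GiB, then above we
 * clamp c to MIN(8 GiB, MAX(3 GiB, 1 GiB)) = 3 GiB and to_free to
 * MIN(4 GiB, 3 GiB - 1 GiB) = 2 GiB, leaving arc_c at 1 GiB; asize is
 * still above the new target, so eviction is requested below.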
*/ if (asize > arc_c) { /* See comment in arc_evict_cb_check() on why lock+flag */ mutex_enter(&arc_evict_lock); arc_evict_needed = B_TRUE; mutex_exit(&arc_evict_lock); zthr_wakeup(arc_evict_zthr); } return (to_free); } /* * Determine if the system is under memory pressure and is asking * to reclaim memory. A return value of B_TRUE indicates that the system * is under memory pressure and that the arc should adjust accordingly. */ boolean_t arc_reclaim_needed(void) { return (arc_available_memory() < 0); } void arc_kmem_reap_soon(void) { size_t i; kmem_cache_t *prev_cache = NULL; kmem_cache_t *prev_data_cache = NULL; #ifdef _KERNEL #if defined(_ILP32) /* * Reclaim unused memory from all kmem caches. */ kmem_reap(); #endif #endif for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { #if defined(_ILP32) /* reach upper limit of cache size on 32-bit */ if (zio_buf_cache[i] == NULL) break; #endif if (zio_buf_cache[i] != prev_cache) { prev_cache = zio_buf_cache[i]; kmem_cache_reap_now(zio_buf_cache[i]); } if (zio_data_buf_cache[i] != prev_data_cache) { prev_data_cache = zio_data_buf_cache[i]; kmem_cache_reap_now(zio_data_buf_cache[i]); } } kmem_cache_reap_now(buf_cache); kmem_cache_reap_now(hdr_full_cache); kmem_cache_reap_now(hdr_l2only_cache); kmem_cache_reap_now(zfs_btree_leaf_cache); abd_cache_reap_now(); } static boolean_t arc_evict_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; #ifdef ZFS_DEBUG /* * This is necessary in order to keep the kstat information * up to date for tools that display kstat data such as the * mdb ::arc dcmd and the Linux crash utility. These tools * typically do not call kstat's update function, but simply * dump out stats from the most recent update. Without * this call, these commands may show stale stats for the * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even * with this call, the data might be out of date if the * evict thread hasn't been woken recently; but that should * suffice. The arc_state_t structures can be queried * directly if more accurate information is needed. */ if (arc_ksp != NULL) arc_ksp->ks_update(arc_ksp, KSTAT_READ); #endif /* * We have to rely on arc_wait_for_eviction() to tell us when to * evict, rather than checking if we are overflowing here, so that we * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. * If we have become "not overflowing" since arc_wait_for_eviction() * checked, we need to wake it up. We could broadcast the CV here, * but arc_wait_for_eviction() may have not yet gone to sleep. We * would need to use a mutex to ensure that this function doesn't * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. * the arc_evict_lock). However, the lock ordering of such a lock * would necessarily be incorrect with respect to the zthr_lock, * which is held before this function is called, and is held by * arc_wait_for_eviction() when it calls zthr_wakeup(). */ if (arc_evict_needed) return (B_TRUE); /* * If we have buffers in uncached state, evict them periodically. */ return ((zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]) && ddi_get_lbolt() - arc_last_uncached_flush > MSEC_TO_TICK(arc_min_prefetch_ms / 2))); } /* * Keep arc_size under arc_c by running arc_evict which evicts data * from the ARC. */ static void arc_evict_cb(void *arg, zthr_t *zthr) { (void) arg; uint64_t evicted = 0; fstrans_cookie_t cookie = spl_fstrans_mark(); /* Always try to evict from uncached state. 
*/ arc_last_uncached_flush = ddi_get_lbolt(); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_DATA, B_FALSE); evicted += arc_flush_state(arc_uncached, 0, ARC_BUFC_METADATA, B_FALSE); /* Evict from other states only if told to. */ if (arc_evict_needed) evicted += arc_evict(); /* * If evicted is zero, we couldn't evict anything * via arc_evict(). This could be due to hash lock * collisions, but more likely due to the majority of * arc buffers being unevictable. Therefore, even if * arc_size is above arc_c, another pass is unlikely to * be helpful and could potentially cause us to enter an * infinite loop. Additionally, zthr_iscancelled() is * checked here so that if the arc is shutting down, the * broadcast will wake any remaining arc evict waiters. * * Note we cancel using zthr instead of arc_evict_zthr * because the latter may not yet be initializd when the * callback is first invoked. */ mutex_enter(&arc_evict_lock); arc_evict_needed = !zthr_iscancelled(zthr) && evicted > 0 && aggsum_compare(&arc_sums.arcstat_size, arc_c) > 0; if (!arc_evict_needed) { /* * We're either no longer overflowing, or we * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ arc_evict_waiter_t *aw; while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { cv_broadcast(&aw->aew_cv); } arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); } static boolean_t arc_reap_cb_check(void *arg, zthr_t *zthr) { (void) arg, (void) zthr; int64_t free_memory = arc_available_memory(); static int reap_cb_check_counter = 0; /* * If a kmem reap is already active, don't schedule more. We must * check for this because kmem_cache_reap_soon() won't actually * block on the cache being reaped (this is to prevent callers from * becoming implicitly blocked by a system-wide kmem reap -- which, * on a system with many, many full magazines, can take minutes). */ if (!kmem_cache_reap_active() && free_memory < 0) { arc_no_grow = B_TRUE; arc_warm = B_TRUE; /* * Wait at least zfs_grow_retry (default 5) seconds * before considering growing. */ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); return (B_TRUE); } else if (free_memory < arc_c >> arc_no_grow_shift) { arc_no_grow = B_TRUE; } else if (gethrtime() >= arc_growtime) { arc_no_grow = B_FALSE; } /* * Called unconditionally every 60 seconds to reclaim unused * zstd compression and decompression context. This is done * here to avoid the need for an independent thread. */ if (!((reap_cb_check_counter++) % 60)) zfs_zstd_cache_reap_now(); return (B_FALSE); } /* * Keep enough free memory in the system by reaping the ARC's kmem * caches. To cause more slabs to be reapable, we may reduce the * target size of the cache (arc_c), causing the arc_evict_cb() * to free more buffers. */ static void arc_reap_cb(void *arg, zthr_t *zthr) { int64_t can_free, free_memory, to_free; (void) arg, (void) zthr; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * Kick off asynchronous kmem_reap()'s of all our caches. */ arc_kmem_reap_soon(); /* * Wait at least arc_kmem_cache_reap_retry_ms between * arc_kmem_reap_soon() calls. Without this check it is possible to * end up in a situation where we spend lots of time reaping * caches, while we're near arc_c_min. Waiting here also gives the * subsequent free memory check a chance of finding that the * asynchronous reap has already freed enough memory, and we don't * need to call arc_reduce_target_size(). 
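 *
 * When a reduction is needed, the computation below works out as
 * follows with made-up numbers (using the default 1/128th, i.e.
 * arc_shrink_shift = 7): with arc_c = 16 GiB and arc_c_min = 1 GiB the
 * fractional amount is 15 GiB >> 7 = 120 MiB; a 200 MiB free-memory
 * deficit then gives to_free = 320 MiB, while 50 MiB of free memory
 * gives to_free = 70 MiB.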
*/ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); /* * Reduce the target size as needed to maintain the amount of free * memory in the system at a fraction of the arc_size (1/128th by * default). If oversubscribed (free_memory < 0) then reduce the * target arc_size by the deficit amount plus the fractional * amount. If free memory is positive but less than the fractional * amount, reduce by what is needed to hit the fractional amount. */ free_memory = arc_available_memory(); can_free = arc_c - arc_c_min; to_free = (MAX(can_free, 0) >> arc_shrink_shift) - free_memory; if (to_free > 0) arc_reduce_target_size(to_free); spl_fstrans_unmark(cookie); } #ifdef _KERNEL /* * Determine the amount of memory eligible for eviction contained in the * ARC. All clean data reported by the ghost lists can always be safely * evicted. Due to arc_c_min, the same does not hold for all clean data * contained by the regular mru and mfu lists. * * In the case of the regular mru and mfu lists, we need to report as * much clean data as possible, such that evicting that same reported * data will not bring arc_size below arc_c_min. Thus, in certain * circumstances, the total amount of clean data in the mru and mfu * lists might not actually be evictable. * * The following two distinct cases are accounted for: * * 1. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is greater than or equal to arc_c_min. * (i.e. amount of dirty data >= arc_c_min) * * This is the easy case; all clean data contained by the mru and mfu * lists is evictable. Evicting all clean data can only drop arc_size * to the amount of dirty data, which is greater than arc_c_min. * * 2. The sum of the amount of dirty data contained by both the mru and * mfu lists, plus the ARC's other accounting (e.g. the anon list), * is less than arc_c_min. * (i.e. arc_c_min > amount of dirty data) * * 2.1. arc_size is greater than or equal arc_c_min. * (i.e. arc_size >= arc_c_min > amount of dirty data) * * In this case, not all clean data from the regular mru and mfu * lists is actually evictable; we must leave enough clean data * to keep arc_size above arc_c_min. Thus, the maximum amount of * evictable data from the two lists combined, is exactly the * difference between arc_size and arc_c_min. * * 2.2. arc_size is less than arc_c_min * (i.e. arc_c_min > arc_size > amount of dirty data) * * In this case, none of the data contained in the mru and mfu * lists is evictable, even if it's clean. Since arc_size is * already below arc_c_min, evicting any more would only * increase this negative difference. */ #endif /* _KERNEL */ /* * Adapt arc info given the number of bytes we are trying to add and * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void arc_adapt(uint64_t bytes) { /* * Wake reap thread if we do not have any available memory */ if (arc_reclaim_needed()) { zthr_wakeup(arc_reap_zthr); return; } if (arc_no_grow) return; if (arc_c >= arc_c_max) return; /* * If we're within (2 * maxblocksize) bytes of the target * cache size, increment the target cache size */ if (aggsum_upper_bound(&arc_sums.arcstat_size) + 2 * SPA_MAXBLOCKSIZE >= arc_c) { uint64_t dc = MAX(bytes, SPA_OLD_MAXBLOCKSIZE); if (atomic_add_64_nv(&arc_c, dc) > arc_c_max) arc_c = arc_c_max; } } /* * Check if ARC current size has grown past our upper thresholds. 
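 *
 * Roughly, ignoring the dnode limit and the reserve adjustment (a
 * sketch of the checks below):
 *
 *   size < arc_c + zfs_max_recordsize               -> ARC_OVF_NONE
 *   lax, and over by less than
 *     (arc_c >> zfs_arc_overflow_shift) / 2         -> ARC_OVF_SOME
 *   otherwise                                       -> ARC_OVF_SEVERE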
*/ static arc_ovf_level_t arc_is_overflowing(boolean_t lax, boolean_t use_reserve) { /* * We just compare the lower bound here for performance reasons. Our * primary goals are to make sure that the arc never grows without * bound, and that it can reach its maximum size. This check * accomplishes both goals. The maximum amount we could run over by is * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block * in the ARC. In practice, that's in the tens of MB, which is low * enough to be safe. */ int64_t arc_over = aggsum_lower_bound(&arc_sums.arcstat_size) - arc_c - zfs_max_recordsize; int64_t dn_over = aggsum_lower_bound(&arc_sums.arcstat_dnode_size) - arc_dnode_limit; /* Always allow at least one block of overflow. */ if (arc_over < 0 && dn_over <= 0) return (ARC_OVF_NONE); /* If we are under memory pressure, report severe overflow. */ if (!lax) return (ARC_OVF_SEVERE); /* We are not under pressure, so be more or less relaxed. */ int64_t overflow = (arc_c >> zfs_arc_overflow_shift) / 2; if (use_reserve) overflow *= 3; return (arc_over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE); } static abd_t * arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, alloc_flags); if (alloc_flags & ARC_HDR_ALLOC_LINEAR) return (abd_alloc_linear(size, type == ARC_BUFC_METADATA)); else return (abd_alloc(size, type == ARC_BUFC_METADATA)); } static void * arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_get_data_impl(hdr, size, tag, 0); if (type == ARC_BUFC_METADATA) { return (zio_buf_alloc(size)); } else { ASSERT(type == ARC_BUFC_DATA); return (zio_data_buf_alloc(size)); } } /* * Wait for the specified amount of data (in bytes) to be evicted from the * ARC, and for there to be sufficient free memory in the system. * The lax argument specifies that caller does not have a specific reason * to wait, not aware of any memory pressure. Low memory handlers though * should set it to B_FALSE to wait for all required evictions to complete. * The use_reserve argument allows some callers to wait less than others * to not block critical code paths, possibly blocking other resources. */ void arc_wait_for_eviction(uint64_t amount, boolean_t lax, boolean_t use_reserve) { switch (arc_is_overflowing(lax, use_reserve)) { case ARC_OVF_NONE: return; case ARC_OVF_SOME: /* * This is a bit racy without taking arc_evict_lock, but the * worst that can happen is we either call zthr_wakeup() extra * time due to race with other thread here, or the set flag * get cleared by arc_evict_cb(), which is unlikely due to * big hysteresis, but also not important since at this level * of overflow the eviction is purely advisory. Same time * taking the global lock here every time without waiting for * the actual eviction creates a significant lock contention. 
*/ if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } return; case ARC_OVF_SEVERE: default: { arc_evict_waiter_t aw; list_link_init(&aw.aew_node); cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); uint64_t last_count = 0; mutex_enter(&arc_evict_lock); if (!list_is_empty(&arc_evict_waiters)) { arc_evict_waiter_t *last = list_tail(&arc_evict_waiters); last_count = last->aew_count; } else if (!arc_evict_needed) { arc_evict_needed = B_TRUE; zthr_wakeup(arc_evict_zthr); } /* * Note, the last waiter's count may be less than * arc_evict_count if we are low on memory in which * case arc_evict_state_impl() may have deferred * wakeups (but still incremented arc_evict_count). */ aw.aew_count = MAX(last_count, arc_evict_count) + amount; list_insert_tail(&arc_evict_waiters, &aw); arc_set_need_free(); DTRACE_PROBE3(arc__wait__for__eviction, uint64_t, amount, uint64_t, arc_evict_count, uint64_t, aw.aew_count); /* * We will be woken up either when arc_evict_count reaches * aew_count, or when the ARC is no longer overflowing and * eviction completes. * In case of "false" wakeup, we will still be on the list. */ do { cv_wait(&aw.aew_cv, &arc_evict_lock); } while (list_link_active(&aw.aew_node)); mutex_exit(&arc_evict_lock); cv_destroy(&aw.aew_cv); } } } /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction * thread to catch up. If we're past the target size but below the hard * limit, we'll only signal the reclaim thread and continue on. */ static void arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag, int alloc_flags) { arc_adapt(size); /* * If arc_size is currently overflowing, we must be adding data * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even * further past it's target size, we wait for the eviction thread to * make some progress. We also wait for there to be sufficient free * memory in the system, as measured by arc_free_memory(). * * Specifically, we wait for zfs_arc_eviction_pct percent of the * requested size to be evicted. This should be more than 100%, to * ensure that that progress is also made towards getting arc_size * under arc_c. See the comment above zfs_arc_eviction_pct. */ arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100, B_TRUE, alloc_flags & ARC_HDR_USE_RESERVE); arc_buf_contents_t type = arc_buf_type(hdr); if (type == ARC_BUFC_METADATA) { arc_space_consume(size, ARC_SPACE_META); } else { arc_space_consume(size, ARC_SPACE_DATA); } /* * Update the state size. Note that ghost states have a * "ghost size" and so don't need to be updated. */ arc_state_t *state = hdr->b_l1hdr.b_state; if (!GHOST_STATE(state)) { (void) zfs_refcount_add_many(&state->arcs_size[type], size, tag); /* * If this is reached via arc_read, the link is * protected by the hash lock. If reached via * arc_buf_alloc, the header should not be accessed by * any other thread. And, if reached via arc_read_done, * the hash lock will protect it if it's found in the * hash table; otherwise no other thread should be * trying to [add|remove]_reference it. 
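 * Only headers already linked on an eviction list have arcs_esize
 * bumped below, and such headers must be unreferenced, hence the
 * refcount assertion.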
*/ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); (void) zfs_refcount_add_many(&state->arcs_esize[type], size, tag); } } } static void arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, const void *tag) { arc_free_data_impl(hdr, size, tag); abd_free(abd); } static void arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, const void *tag) { arc_buf_contents_t type = arc_buf_type(hdr); arc_free_data_impl(hdr, size, tag); if (type == ARC_BUFC_METADATA) { zio_buf_free(buf, size); } else { ASSERT(type == ARC_BUFC_DATA); zio_data_buf_free(buf, size); } } /* * Free the arc data buffer. */ static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, const void *tag) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); ASSERT(state != arc_anon && state != arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_esize[type], size, tag); } (void) zfs_refcount_remove_many(&state->arcs_size[type], size, tag); VERIFY3U(hdr->b_type, ==, type); if (type == ARC_BUFC_METADATA) { arc_space_return(size, ARC_SPACE_META); } else { ASSERT(type == ARC_BUFC_DATA); arc_space_return(size, ARC_SPACE_DATA); } } /* * This routine is called whenever a buffer is accessed. */ static void arc_access(arc_buf_hdr_t *hdr, arc_flags_t arc_flags, boolean_t hit) { ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT(HDR_HAS_L1HDR(hdr)); /* * Update buffer prefetch status. */ boolean_t was_prefetch = HDR_PREFETCH(hdr); boolean_t now_prefetch = arc_flags & ARC_FLAG_PREFETCH; if (was_prefetch != now_prefetch) { if (was_prefetch) { ARCSTAT_CONDSTAT(hit, demand_hit, demand_iohit, HDR_PRESCIENT_PREFETCH(hdr), prescient, predictive, prefetch); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_decrement_state(hdr); if (was_prefetch) { arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH); } else { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } if (HDR_HAS_L2HDR(hdr)) l2arc_hdr_arcstats_increment_state(hdr); } if (now_prefetch) { if (arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) { arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_prescient_prefetch); } else { ARCSTAT_BUMP(arcstat_predictive_prefetch); } } if (arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); clock_t now = ddi_get_lbolt(); if (hdr->b_l1hdr.b_state == arc_anon) { arc_state_t *new_state; /* * This buffer is not in the cache, and does not appear in * our "ghost" lists. Add it to the MRU or uncached state. */ ASSERT0(hdr->b_l1hdr.b_arc_access); hdr->b_l1hdr.b_arc_access = now; if (HDR_UNCACHED(hdr)) { new_state = arc_uncached; DTRACE_PROBE1(new_state__uncached, arc_buf_hdr_t *, hdr); } else { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mru) { /* * This buffer has been accessed once recently and either * its read is still in progress or it is in the cache. */ if (HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_arc_access = now; return; } hdr->b_l1hdr.b_mru_hits++; ARCSTAT_BUMP(arcstat_mru_hits); /* * If the previous access was a prefetch, then it already * handled possible promotion, so nothing more to do for now. 
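 * A demand hit that follows a prefetch therefore only refreshes
 * b_arc_access below and does not re-evaluate MFU promotion.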
*/ if (was_prefetch) { hdr->b_l1hdr.b_arc_access = now; return; } /* * If more than ARC_MINTIME have passed from the previous * hit, promote the buffer to the MFU state. */ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access + ARC_MINTIME)) { hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { arc_state_t *new_state; /* * This buffer has been accessed once recently, but was * evicted from the cache. Would we have bigger MRU, it * would be an MRU hit, so handle it the same way, except * we don't need to check the previous access time. */ hdr->b_l1hdr.b_mru_ghost_hits++; ARCSTAT_BUMP(arcstat_mru_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mru_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); if (was_prefetch) { new_state = arc_mru; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); } arc_change_state(new_state, hdr); } else if (hdr->b_l1hdr.b_state == arc_mfu) { /* * This buffer has been accessed more than once and either * still in the cache or being restored from one of ghosts. */ if (!HDR_IO_IN_PROGRESS(hdr)) { hdr->b_l1hdr.b_mfu_hits++; ARCSTAT_BUMP(arcstat_mfu_hits); } hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { /* * This buffer has been accessed more than once recently, but * has been evicted from the cache. Would we have bigger MFU * it would stay in cache, so move it back to MFU state. */ hdr->b_l1hdr.b_mfu_ghost_hits++; ARCSTAT_BUMP(arcstat_mfu_ghost_hits); hdr->b_l1hdr.b_arc_access = now; wmsum_add(&arc_mfu_ghost->arcs_hits[arc_buf_type(hdr)], arc_hdr_size(hdr)); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); arc_change_state(arc_mfu, hdr); } else if (hdr->b_l1hdr.b_state == arc_uncached) { /* * This buffer is uncacheable, but we got a hit. Probably * a demand read after prefetch. Nothing more to do here. */ if (!HDR_IO_IN_PROGRESS(hdr)) ARCSTAT_BUMP(arcstat_uncached_hits); hdr->b_l1hdr.b_arc_access = now; } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { /* * This buffer is on the 2nd Level ARC and was not accessed * for a long time, so treat it as new and put into MRU. */ hdr->b_l1hdr.b_arc_access = now; DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); arc_change_state(arc_mru, hdr); } else { cmn_err(CE_PANIC, "invalid arc state 0x%p", hdr->b_l1hdr.b_state); } } /* * This routine is called by dbuf_hold() to update the arc_access() state * which otherwise would be skipped for entries in the dbuf cache. */ void arc_buf_access(arc_buf_t *buf) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * Avoid taking the hash_lock when possible as an optimization. * The header must be checked again under the hash_lock in order * to handle the case where it is concurrently being released. 
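 * The unlocked check below is repeated under the hash_lock; if the
 * header became anonymous or empty in the meantime the access is
 * counted as arcstat_access_skip and otherwise ignored.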
*/ if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) return; kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_access_skip); return; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, 0, B_TRUE); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(B_TRUE /* demand */, demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } /* a generic arc_read_done_func_t which you can use */ void arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zio, (void) zb, (void) bp; if (buf == NULL) return; memcpy(arg, buf->b_data, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } /* a generic arc_read_done_func_t */ void arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *arg) { (void) zb, (void) bp; arc_buf_t **bufp = arg; if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; ASSERT(buf->b_data != NULL); } } static void arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT0(HDR_GET_PSIZE(hdr)); ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); } } static void arc_read_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; arc_callback_t *acb; /* * The hdr was inserted into hash-table and removed from lists * prior to starting I/O. We should find this header, since * it's in the hash table, and it should be legit since it's * not possible to evict it during the I/O. The only possible * reason for it not to be found is if we were freed during the * read. 
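 * The assertions below verify that the DVA and birth recorded in the
 * header still match the block pointer of the completed zio.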
*/ if (HDR_IN_HASH_TABLE(hdr)) { arc_buf_hdr_t *found; ASSERT3U(hdr->b_birth, ==, BP_GET_PHYSICAL_BIRTH(zio->io_bp)); ASSERT3U(hdr->b_dva.dva_word[0], ==, BP_IDENTITY(zio->io_bp)->dva_word[0]); ASSERT3U(hdr->b_dva.dva_word[1], ==, BP_IDENTITY(zio->io_bp)->dva_word[1]); found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); ASSERT((found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || (found == hdr && HDR_L2_READING(hdr))); ASSERT3P(hash_lock, !=, NULL); } if (BP_IS_PROTECTED(bp)) { hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); if (zio->io_error == 0) { if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { void *tmpbuf; tmpbuf = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); zio_crypt_decode_mac_zil(tmpbuf, hdr->b_crypt_hdr.b_mac); abd_return_buf(zio->io_abd, tmpbuf, sizeof (zil_chain_t)); } else { zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } } } if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } if (!HDR_L2_READING(hdr)) { hdr->b_complevel = zio->io_prop.zp_complevel; } } arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); if (l2arc_noprefetch && HDR_PREFETCH(hdr)) arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); hdr->b_l1hdr.b_acb = NULL; /* * If a read request has a callback (i.e. acb_done is not NULL), then we * make a buf containing the data according to the parameters which were * passed in. The implementation of arc_buf_alloc_impl() ensures that we * aren't needlessly decompressing the data multiple times. */ int callback_cnt = 0; for (acb = callback_list; acb != NULL; acb = acb->acb_next) { /* We need the last one to call below in original order. */ callback_list = acb; if (!acb->acb_done || acb->acb_nobuf) continue; callback_cnt++; if (zio->io_error != 0) continue; int error = arc_buf_alloc_impl(hdr, zio->io_spa, &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); /* * Assert non-speculative zios didn't fail because an * encryption key wasn't loaded */ ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || error != EACCES); /* * If we failed to decrypt, report an error now (as the zio * layer would have done if it had done the transforms). */ if (error == ECKSUM) { ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(zio->io_spa, &acb->acb_zb, BP_GET_PHYSICAL_BIRTH(zio->io_bp)); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); } } if (error != 0) { /* * Decompression or decryption failed. Set * io_error so that when we call acb_done * (below), we will indicate that the read * failed. Note that in the unusual case * where one callback is compressed and another * uncompressed, we will mark all of them * as failed, even though the uncompressed * one can't actually fail. In this case, * the hdr will not be anonymous, because * if there are multiple callbacks, it's * because multiple threads found the same * arc buf in the hash table. 
*/ zio->io_error = error; } } /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is * in the hash table. This ensures that if there are multiple * callbacks, the hdr is not anonymous. If it were anonymous, * we couldn't use arc_buf_destroy() in the error case below. */ ASSERT(callback_cnt < 2 || hash_lock != NULL); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); if (hdr->b_l1hdr.b_state != arc_anon) arc_change_state(arc_anon, hdr); if (HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); (void) remove_reference(hdr, hdr); if (hash_lock != NULL) mutex_exit(hash_lock); /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* * If arc_buf_alloc_impl() fails during * decompression, the buf will still be * allocated, and needs to be freed here. */ arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { acb->acb_zio_dummy->io_error = zio->io_error; zio_nowait(acb->acb_zio_dummy); } callback_list = acb->acb_prev; if (acb->acb_wait) { mutex_enter(&acb->acb_wait_lock); acb->acb_wait_error = zio->io_error; acb->acb_wait = B_FALSE; cv_signal(&acb->acb_wait_cv); mutex_exit(&acb->acb_wait_lock); /* acb will be freed by the waiting thread. */ } else { kmem_free(acb, sizeof (arc_callback_t)); } } } /* * Lookup the block at the specified DVA (in bp), and return the manner in * which the block is cached. A zero return indicates not cached. */ int arc_cached(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; uint64_t guid = spa_load_guid(spa); int flags = 0; if (BP_IS_EMBEDDED(bp)) return (ARC_CACHED_EMBEDDED); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return (0); if (HDR_HAS_L1HDR(hdr)) { arc_state_t *state = hdr->b_l1hdr.b_state; /* * We switch to ensure that any future arc_state_type_t * changes are handled. This is just a shift to promote * more compile-time checking. */ switch (state->arcs_state) { case ARC_STATE_ANON: break; case ARC_STATE_MRU: flags |= ARC_CACHED_IN_MRU | ARC_CACHED_IN_L1; break; case ARC_STATE_MFU: flags |= ARC_CACHED_IN_MFU | ARC_CACHED_IN_L1; break; case ARC_STATE_UNCACHED: /* The header is still in L1, probably not for long */ flags |= ARC_CACHED_IN_L1; break; default: break; } } if (HDR_HAS_L2HDR(hdr)) flags |= ARC_CACHED_IN_L2; mutex_exit(hash_lock); return (flags); } /* * "Read" the block at the specified DVA (in bp) via the * cache. If the block is found in the cache, invoke the provided * callback immediately and return. Note that the `zio' parameter * in the callback will be NULL in this case, since no IO was * required. If the block is not in the cache pass the read request * on to the spa with a substitute callback function, so that the * requested block will be added to the cache. * * If a read request arrives for a block that has a read in-progress, * either wait for the in-progress read to complete (and return the * results); or, if this is a read with a "done" func, add a record * to the read to invoke the "done" func when the read completes, * and return; or just return. * * arc_read_done() will invoke all the requested "done" functions * for readers of this block. 
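 * Callers choose the blocking behaviour through arc_flags:
 * ARC_FLAG_WAIT makes the call synchronous, ARC_FLAG_NOWAIT issues
 * the I/O and returns immediately, and ARC_FLAG_CACHED_ONLY fails
 * with ENOENT instead of issuing any I/O on a miss.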
*/ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = NULL; kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp); boolean_t no_buf = *arc_flags & ARC_FLAG_NO_BUF; arc_buf_t *buf = NULL; int rc = 0; boolean_t bp_validation = B_FALSE; ASSERT(!embedded_bp || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); ASSERT(!BP_IS_HOLE(bp)); ASSERT(!BP_IS_REDACTED(bp)); /* * Normally SPL_FSTRANS will already be set since kernel threads which * expect to call the DMU interfaces will set it when created. System * calls are similarly handled by setting/cleaning the bit in the * registered callback (module/os/.../zfs/zpl_*). * * External consumers such as Lustre which call the exported DMU * interfaces may not have set SPL_FSTRANS. To avoid a deadlock * on the hash_lock always set and clear the bit. */ fstrans_cookie_t cookie = spl_fstrans_mark(); top: if (!embedded_bp) { /* * Embedded BP's have no DVA and require no I/O to "read". * Create an anonymous arc buf to back it. */ hdr = buf_hash_find(guid, bp, &hash_lock); } /* * Determine if we have an L1 cache hit or a cache miss. For simplicity * we maintain encrypted data separately from compressed / uncompressed * data. If the user is requesting raw encrypted data and we don't have * that in the header we will read from disk to guarantee that we can * get it even if the encryption keys aren't loaded. */ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { boolean_t is_data = !HDR_ISTYPE_METADATA(hdr); /* * Verify the block pointer contents are reasonable. This * should always be the case since the blkptr is protected by * a checksum. */ if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_SKIP, BLK_VERIFY_LOG)) { mutex_exit(hash_lock); rc = SET_ERROR(ECKSUM); goto done; } if (HDR_IO_IN_PROGRESS(hdr)) { if (*arc_flags & ARC_FLAG_CACHED_ONLY) { mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_cached_only_in_progress); rc = SET_ERROR(ENOENT); goto done; } zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* * This is a sync read that needs to wait for * an in-flight async read. Request that the * zio have its priority upgraded. */ zio_change_priority(head_zio, priority); DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_async_upgrade_sync); } DTRACE_PROBE1(arc__iohit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_FALSE); /* * If there are multiple threads reading the same block * and that block is not yet in the ARC, then only one * thread will do the physical I/O and all other * threads will wait until that I/O completes. * Synchronous reads use the acb_wait_cv whereas nowait * reads register a callback. Both are signalled/called * in arc_read_done. * * Errors of the physical I/O may need to be propagated. * Synchronous read errors are returned here from * arc_read_done via acb_wait_error. 
Nowait reads * attach the acb_zio_dummy zio to pio and * arc_read_done propagates the physical I/O's io_error * to acb_zio_dummy, and thereby to pio. */ arc_callback_t *acb = NULL; if (done || pio || *arc_flags & ARC_FLAG_WAIT) { acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; if (*arc_flags & ARC_FLAG_WAIT) { acb->acb_wait = B_TRUE; mutex_init(&acb->acb_wait_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, NULL); } acb->acb_zb = *zb; if (pio != NULL) { acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); } acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_iohits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, iohits); if (*arc_flags & ARC_FLAG_WAIT) { mutex_enter(&acb->acb_wait_lock); while (acb->acb_wait) { cv_wait(&acb->acb_wait_cv, &acb->acb_wait_lock); } rc = acb->acb_wait_error; mutex_exit(&acb->acb_wait_lock); mutex_destroy(&acb->acb_wait_lock); cv_destroy(&acb->acb_wait_cv); kmem_free(acb, sizeof (arc_callback_t)); } goto out; } ASSERT(hdr->b_l1hdr.b_state == arc_mru || hdr->b_l1hdr.b_state == arc_mfu || hdr->b_l1hdr.b_state == arc_uncached); DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, *arc_flags, B_TRUE); if (done && !no_buf) { ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ rc = arc_buf_alloc_impl(hdr, spa, zb, private, encrypted_read, compressed_read, noauth_read, B_TRUE, &buf); if (rc == ECKSUM) { /* * Convert authentication and decryption errors * to EIO (and generate an ereport if needed) * before leaving the ARC. */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { spa_log_error(spa, zb, hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } } if (rc != 0) { arc_buf_destroy_impl(buf); buf = NULL; (void) remove_reference(hdr, private); } /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc != EACCES); } mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, is_data, data, metadata, hits); *arc_flags |= ARC_FLAG_CACHED; goto done; } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; abd_t *hdr_abd; int alloc_flags = encrypted_read ? ARC_HDR_ALLOC_RDATA : 0; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); int config_lock; int error; if (*arc_flags & ARC_FLAG_CACHED_ONLY) { if (hash_lock != NULL) mutex_exit(hash_lock); rc = SET_ERROR(ENOENT); goto done; } if (zio_flags & ZIO_FLAG_CONFIG_WRITER) { config_lock = BLK_CONFIG_HELD; } else if (hash_lock != NULL) { /* * Prevent lock order reversal */ config_lock = BLK_CONFIG_NEEDED_TRY; } else { config_lock = BLK_CONFIG_NEEDED; } /* * Verify the block pointer contents are reasonable. This * should always be the case since the blkptr is protected by * a checksum. 
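 * An EBUSY result from the try-lock variant is retried below with
 * BLK_CONFIG_NEEDED; on success bp_validation is set so the check is
 * not repeated after the goto top.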
*/ if (!bp_validation && (error = zfs_blkptr_verify(spa, bp, config_lock, BLK_VERIFY_LOG))) { if (hash_lock != NULL) mutex_exit(hash_lock); if (error == EBUSY && !zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { bp_validation = B_TRUE; goto top; } rc = SET_ERROR(ECKSUM); goto done; } if (hdr == NULL) { /* * This block is not in the cache or it has * embedded data. */ arc_buf_hdr_t *exists = NULL; hdr = arc_hdr_alloc(guid, psize, lsize, BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type); if (!embedded_bp) { hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = BP_GET_PHYSICAL_BIRTH(bp); exists = buf_hash_insert(hdr, &hash_lock); } if (exists != NULL) { /* somebody beat us to the hash insert */ mutex_exit(hash_lock); buf_discard_identity(hdr); arc_hdr_destroy(hdr); goto top; /* restart the IO request */ } } else { /* * This block is in the ghost cache or encrypted data * was requested and we didn't have it. If it was * L2-only (and thus didn't have an L1 hdr), * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } if (GHOST_STATE(hdr->b_l1hdr.b_state)) { ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0(zfs_refcount_count( &hdr->b_l1hdr.b_refcnt)); ASSERT0P(hdr->b_l1hdr.b_buf); #ifdef ZFS_DEBUG ASSERT0P(hdr->b_l1hdr.b_freeze_cksum); #endif } else if (HDR_IO_IN_PROGRESS(hdr)) { /* * If this header already had an IO in progress * and we are performing another IO to fetch * encrypted data we must wait until the first * IO completes so as not to confuse * arc_read_done(). This should be very rare * and so the performance impact shouldn't * matter. */ arc_callback_t *acb = kmem_zalloc( sizeof (arc_callback_t), KM_SLEEP); acb->acb_wait = B_TRUE; mutex_init(&acb->acb_wait_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&acb->acb_wait_cv, NULL, CV_DEFAULT, NULL); acb->acb_zio_head = hdr->b_l1hdr.b_acb->acb_zio_head; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb->acb_prev = acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); mutex_enter(&acb->acb_wait_lock); while (acb->acb_wait) { cv_wait(&acb->acb_wait_cv, &acb->acb_wait_lock); } mutex_exit(&acb->acb_wait_lock); mutex_destroy(&acb->acb_wait_lock); cv_destroy(&acb->acb_wait_cv); kmem_free(acb, sizeof (arc_callback_t)); goto top; } } if (*arc_flags & ARC_FLAG_UNCACHED) { arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); if (!encrypted_read) alloc_flags |= ARC_HDR_ALLOC_LINEAR; } /* * Take additional reference for IO_IN_PROGRESS. It stops * arc_access() from putting this header without any buffers * and so other references but obviously nonevictable onto * the evictable list of MRU or MFU state. */ add_reference(hdr, hdr); if (!embedded_bp) arc_access(hdr, *arc_flags, B_FALSE); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); arc_hdr_alloc_abd(hdr, alloc_flags); if (encrypted_read) { ASSERT(HDR_HAS_RABD(hdr)); size = HDR_GET_PSIZE(hdr); hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); size = arc_hdr_size(hdr); hdr_abd = hdr->b_l1hdr.b_pabd; if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { zio_flags |= ZIO_FLAG_RAW_COMPRESS; } /* * For authenticated bp's, we do not ask the ZIO layer * to authenticate them since this will cause the entire * IO to fail if the key isn't loaded. Instead, we * defer authentication until arc_buf_fill(), which will * verify the data when the key is available. 
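 * Hence ZIO_FLAG_RAW_ENCRYPT is set below for authenticated BPs and
 * the header is marked ARC_FLAG_NOAUTH.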
*/ if (BP_IS_AUTHENTICATED(bp)) zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } if (BP_IS_AUTHENTICATED(bp)) arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; acb->acb_encrypted = encrypted_read; acb->acb_noauth = noauth_read; acb->acb_nobuf = no_buf; acb->acb_zb = *zb; ASSERT0P(hdr->b_l1hdr.b_acb); hdr->b_l1hdr.b_acb = acb; if (HDR_HAS_L2HDR(hdr) && (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { devw = hdr->b_l2hdr.b_dev->l2ad_writing; addr = hdr->b_l2hdr.b_daddr; /* * Lock out L2ARC device removal. */ if (vdev_is_dead(vd) || !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) vd = NULL; } /* * We count both async reads and scrub IOs as asynchronous so * that both can be upgraded in the event of a cache hit while * the read IO is still in-flight. */ if (priority == ZIO_PRIORITY_ASYNC_READ || priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); /* * At this point, we have a level 1 cache miss or a blkptr * with embedded data. Try again in L2ARC if possible. */ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); /* * Skip ARC stat bump for block pointers with embedded * data. The data are read from the blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, lsize, zbookmark_phys_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(*arc_flags & ARC_FLAG_PREFETCH), demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, misses); zfs_racct_read(spa, size, 1, (*arc_flags & ARC_FLAG_UNCACHED) ? DMU_UNCACHEDIO : 0); } /* Check if the spa even has l2 configured */ const boolean_t spa_has_l2 = l2arc_ndev != 0 && spa->spa_l2cache.sav_count > 0; if (vd != NULL && spa_has_l2 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. * 2. This buffer still has L2ARC metadata. * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. */ if (HDR_HAS_L2HDR(hdr) && !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { l2arc_read_callback_t *cb; abd_t *abd; uint64_t asize; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_hits); hdr->b_l2hdr.b_hits++; cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_hdr = hdr; cb->l2rcb_bp = *bp; cb->l2rcb_zb = *zb; cb->l2rcb_flags = zio_flags; /* * When Compressed ARC is disabled, but the * L2ARC block is compressed, arc_hdr_size() * will have returned LSIZE rather than PSIZE. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr) && HDR_GET_PSIZE(hdr) != 0) { size = HDR_GET_PSIZE(hdr); } asize = vdev_psize_to_asize(vd, size); if (asize != size) { abd = abd_alloc_for_io(asize, HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && addr + asize <= vd->vdev_psize - VDEV_LABEL_END_SIZE); /* * l2arc read. The SCL_L2ARC lock will be * released by l2arc_read_done(). * Issue a null zio if the underlying buffer * was squashed to zero size by compression. 
*/ ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_WAIT); if (zio_wait(rzio) == 0) goto out; /* l2arc read error; goto zio_read() */ if (hash_lock != NULL) mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); if (HDR_L2_WRITING(hdr)) ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } } else { if (vd != NULL) spa_config_exit(spa, SCL_L2ARC, vd); /* * Only a spa with l2 should contribute to l2 * miss stats. (Including the case of having a * faulted cache device - that's also a miss.) */ if (spa_has_l2) { /* * Skip ARC stat bump for block pointers with * embedded data. The data are read from the * blkptr itself via * decode_embedded_bp_compressed(). */ if (!embedded_bp) { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); ARCSTAT_BUMP(arcstat_l2_misses); } } } rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; if (hash_lock != NULL) mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); goto out; } ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } out: /* embedded bps don't actually go to disk */ if (!embedded_bp) spa_read_history_add(spa, zb, *arc_flags); spl_fstrans_unmark(cookie); return (rc); done: if (done) done(NULL, zb, bp, buf, private); if (pio && rc != 0) { zio_t *zio = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); zio->io_error = rc; zio_nowait(zio); } goto out; } arc_prune_t * arc_add_prune_callback(arc_prune_func_t *func, void *private) { arc_prune_t *p; p = kmem_alloc(sizeof (*p), KM_SLEEP); p->p_pfunc = func; p->p_private = private; list_link_init(&p->p_node); zfs_refcount_create(&p->p_refcnt); mutex_enter(&arc_prune_mtx); zfs_refcount_add(&p->p_refcnt, &arc_prune_list); list_insert_head(&arc_prune_list, p); mutex_exit(&arc_prune_mtx); return (p); } void arc_remove_prune_callback(arc_prune_t *p) { boolean_t wait = B_FALSE; mutex_enter(&arc_prune_mtx); list_remove(&arc_prune_list, p); if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) wait = B_TRUE; mutex_exit(&arc_prune_mtx); /* wait for arc_prune_task to finish */ if (wait) taskq_wait_outstanding(arc_prune_taskq, 0); ASSERT0(zfs_refcount_count(&p->p_refcnt)); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } /* * Helper function for arc_prune_async() it is responsible for safely * handling the execution of a registered arc_prune_func_t. */ static void arc_prune_task(void *ptr) { arc_prune_t *ap = (arc_prune_t *)ptr; arc_prune_func_t *func = ap->p_pfunc; if (func != NULL) func(ap->p_adjust, ap->p_private); (void) zfs_refcount_remove(&ap->p_refcnt, func); } /* * Notify registered consumers they must drop holds on a portion of the ARC * buffers they reference. This provides a mechanism to ensure the ARC can * honor the metadata limit and reclaim otherwise pinned ARC buffers. * * This operation is performed asynchronously so it may be safely called * in the context of the arc_reclaim_thread(). 
A reference is taken here * for each registered arc_prune_t and the arc_prune_task() is responsible * for releasing it once the registered arc_prune_func_t has completed. */ static void arc_prune_async(uint64_t adjust) { arc_prune_t *ap; mutex_enter(&arc_prune_mtx); for (ap = list_head(&arc_prune_list); ap != NULL; ap = list_next(&arc_prune_list, ap)) { if (zfs_refcount_count(&ap->p_refcnt) >= 2) continue; zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); ap->p_adjust = adjust; if (taskq_dispatch(arc_prune_taskq, arc_prune_task, ap, TQ_SLEEP) == TASKQID_INVALID) { (void) zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); continue; } ARCSTAT_BUMP(arcstat_prune); } mutex_exit(&arc_prune_mtx); } /* * Notify the arc that a block was freed, and thus will never be used again. */ void arc_freed(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *hdr; kmutex_t *hash_lock; uint64_t guid = spa_load_guid(spa); ASSERT(!BP_IS_EMBEDDED(bp)); hdr = buf_hash_find(guid, bp, &hash_lock); if (hdr == NULL) return; /* * We might be trying to free a block that is still doing I/O * (i.e. prefetch) or has some other reference (i.e. a dedup-ed, * dmu_sync-ed block). A block may also have a reference if it is * part of a dedup-ed, dmu_synced write. The dmu_sync() function would * have written the new block to its final resting place on disk but * without the dedup flag set. This would have left the hdr in the MRU * state and discoverable. When the txg finally syncs it detects that * the block was overridden in open context and issues an override I/O. * Since this is a dedup block, the override I/O will determine if the * block is already in the DDT. If so, then it will replace the io_bp * with the bp from the DDT and allow the I/O to finish. When the I/O * reaches the done callback, dbuf_write_override_done, it will * check to see if the io_bp and io_bp_override are identical. * If they are not, then it indicates that the bp was replaced with * the bp in the DDT and the override bp is freed. This allows * us to arrive here with a reference on a block that is being * freed. So if we have an I/O in progress, or a reference to * this hdr, then we don't destroy the hdr. */ if (!HDR_HAS_L1HDR(hdr) || zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); mutex_exit(hash_lock); } else { mutex_exit(hash_lock); } } /* * Release this buffer from the cache, making it an anonymous buffer. This * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ void arc_release(arc_buf_t *buf, const void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; /* * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * We don't grab the hash lock prior to this check, because if * the buffer's header is in the arc_anon state, it won't be * linked into the hash table. */ if (hdr->b_l1hdr.b_state == arc_anon) { ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); hdr->b_l1hdr.b_arc_access = 0; /* * If the buf is being overridden then it may already * have a hdr that is not empty. 
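 * buf_discard_identity() below drops any such stale identity before
 * the buffer is thawed.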
*/ buf_discard_identity(hdr); arc_buf_thaw(buf); return; } kmutex_t *hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); /* * This assignment is only valid as long as the hash_lock is * held, we must be careful not to reference state or the * b_state field after dropping the lock. */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); ASSERT3P(state, !=, arc_anon); ASSERT3P(state, !=, arc_l2c_only); /* this buffer is not on any list */ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); /* * Do we have more than one buf? */ if (hdr->b_l1hdr.b_buf != buf || !ARC_BUF_LAST(buf)) { arc_buf_hdr_t *nhdr; uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); boolean_t protected = HDR_PROTECTED(hdr); enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); ASSERT(ARC_BUF_LAST(buf)); } /* * Pull the buffer off of this hdr and find the last buffer * in the hdr's buffer list. */ VERIFY3S(remove_reference(hdr, tag), >, 0); arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); ASSERT3P(lastbuf, !=, NULL); /* * If the current arc_buf_t and the hdr are sharing their data * buffer, then we must stop sharing that block. */ if (ARC_BUF_SHARED(buf)) { ASSERT(!arc_buf_is_shared(lastbuf)); /* * First, sever the block sharing relationship between * buf and the arc_buf_hdr_t. */ arc_unshare_buf(hdr, buf); /* * Now we need to recreate the hdr's b_pabd. Since we * have lastbuf handy, we try to share with it, but if * we can't then we allocate a new b_pabd and copy the * data from buf into it. */ if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { arc_hdr_alloc_abd(hdr, 0); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } } else if (HDR_SHARED_DATA(hdr)) { /* * Uncompressed shared buffers are always at the end * of the list. Compressed buffers don't have the * same requirements. This makes it hard to * simply assert that the lastbuf is shared so * we rely on the hdr's compression flags to determine * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); } ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_size[type], arc_buf_size(buf), buf); arc_cksum_verify(buf); arc_buf_unwatch(buf); /* if this is the last uncompressed buf free the checksum */ if (!arc_hdr_has_uncompressed_buf(hdr)) arc_cksum_free(hdr); mutex_exit(hash_lock); nhdr = arc_hdr_alloc(spa, psize, lsize, protected, compress, hdr->b_complevel, type); ASSERT0P(nhdr->b_l1hdr.b_buf); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); VERIFY3U(nhdr->b_type, ==, type); ASSERT(!HDR_SHARED_DATA(nhdr)); nhdr->b_l1hdr.b_buf = buf; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; (void) zfs_refcount_add_many(&arc_anon->arcs_size[type], arc_buf_size(buf), buf); } else { ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); /* protected by hash lock, or hdr is on arc_anon */ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); if (HDR_HAS_L2HDR(hdr)) { mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); /* Recheck to prevent race with l2arc_evict(). 
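 * l2arc_evict() may have dropped the L2 header between the check
 * above and taking l2ad_mtx, so HDR_HAS_L2HDR() is re-tested under
 * the lock.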
*/ if (HDR_HAS_L2HDR(hdr)) arc_hdr_l2hdr_destroy(hdr); mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); } hdr->b_l1hdr.b_mru_hits = 0; hdr->b_l1hdr.b_mru_ghost_hits = 0; hdr->b_l1hdr.b_mfu_hits = 0; hdr->b_l1hdr.b_mfu_ghost_hits = 0; arc_change_state(arc_anon, hdr); hdr->b_l1hdr.b_arc_access = 0; mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } } int arc_released(arc_buf_t *buf) { return (buf->b_data != NULL && buf->b_hdr->b_l1hdr.b_state == arc_anon); } #ifdef ZFS_DEBUG int arc_referenced(arc_buf_t *buf) { return (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); } #endif static void arc_write_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; blkptr_t *bp = zio->io_bp; uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); fstrans_cookie_t cookie = spl_fstrans_mark(); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); /* * If we're reexecuting this zio because the pool suspended, then * cleanup any state that was previously set the first time the * callback was invoked. */ if (zio->io_flags & ZIO_FLAG_REEXECUTED) { arc_cksum_free(hdr); arc_buf_unwatch(buf); if (hdr->b_l1hdr.b_pabd != NULL) { if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); } ASSERT0P(hdr->b_l1hdr.b_pabd); ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); callback->awcb_ready(zio, buf, callback->awcb_private); if (HDR_IO_IN_PROGRESS(hdr)) { ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); add_reference(hdr, hdr); /* For IO_IN_PROGRESS. */ } if (BP_IS_PROTECTED(bp)) { /* ZIL blocks are written through zio_rewrite */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); if (BP_SHOULD_BYTESWAP(bp)) { if (BP_GET_LEVEL(bp) > 0) { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; } else { hdr->b_l1hdr.b_byteswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); } } else { hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; } arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv); zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_PROTECTED); } /* * If this block was written for raw encryption but the zio layer * ended up only authenticating it, adjust the buffer flags now. */ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; } /* this must be done after the buffer flags are adjusted */ arc_cksum_compute(buf); enum zio_compress compress; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); hdr->b_complevel = zio->io_prop.zp_complevel; if (zio->io_error != 0 || psize == 0) goto out; /* * Fill the hdr with data. 
If the buffer is encrypted we have no choice * but to copy the data into b_radb. If the hdr is compressed, the data * we want is available from the zio, otherwise we can take it from * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a * lot of shareable data. As a compromise, we check whether scattered * ABDs are allowed, and assume that if they are then the user wants * the ARC to be primarily filled with them regardless of the data being * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ if (ARC_BUF_ENCRYPTED(buf)) { ASSERT3U(psize, >, 0); ASSERT(ARC_BUF_COMPRESSED(buf)); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (!(HDR_UNCACHED(hdr) || abd_size_alloc_linear(arc_buf_size(buf))) || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_ALLOC_RDATA | ARC_HDR_USE_RESERVE); abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && !ARC_BUF_COMPRESSED(buf)) { ASSERT3U(psize, >, 0); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); arc_hdr_alloc_abd(hdr, ARC_HDR_USE_RESERVE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } } else { ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3P(hdr->b_l1hdr.b_buf, ==, buf); ASSERT(ARC_BUF_LAST(buf)); arc_share_buf(hdr, buf); } out: arc_hdr_verify(hdr, bp); spl_fstrans_unmark(cookie); } static void arc_write_children_ready(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; callback->awcb_children_ready(zio, buf, callback->awcb_private); } static void arc_write_done(zio_t *zio) { arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT0P(hdr->b_l1hdr.b_acb); if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { buf_discard_identity(hdr); } else { hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = BP_GET_PHYSICAL_BIRTH(zio->io_bp); } } else { ASSERT(HDR_EMPTY(hdr)); } /* * If the block to be written was all-zero or compressed enough to be * embedded in the BP, no write was performed so there will be no * dva/birth/checksum. The buffer must therefore remain anonymous * (and uncached). */ if (!HDR_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; ASSERT0(zio->io_error); arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); if (exists != NULL) { /* * This can only happen if we overwrite for * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). 
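 * The rewrite, nopwrite and dedup cases are each handled below.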
*/ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad overwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); ASSERT(zfs_refcount_is_zero( &exists->b_l1hdr.b_refcnt)); arc_change_state(arc_anon, exists); arc_hdr_destroy(exists); mutex_exit(hash_lock); exists = buf_hash_insert(hdr, &hash_lock); ASSERT0P(exists); } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { /* nopwrite */ ASSERT(zio->io_prop.zp_nopwrite); if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) panic("bad nopwrite, hdr=%p exists=%p", (void *)hdr, (void *)exists); } else { /* Dedup */ ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); ASSERT(ARC_BUF_LAST(hdr->b_l1hdr.b_buf)); ASSERT(hdr->b_l1hdr.b_state == arc_anon); ASSERT(BP_GET_DEDUP(zio->io_bp)); ASSERT0(BP_GET_LEVEL(zio->io_bp)); } } arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); /* if it's not anon, we are doing a scrub */ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) arc_access(hdr, 0, B_FALSE); mutex_exit(hash_lock); } else { arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); VERIFY3S(remove_reference(hdr, hdr), >, 0); } callback->awcb_done(zio, buf, callback->awcb_private); abd_free(zio->io_abd); kmem_free(callback, sizeof (arc_write_callback_t)); } zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *children_ready, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; zio_prop_t localprop = *zp; ASSERT3P(ready, !=, NULL); ASSERT3P(done, !=, NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); ASSERT0P(hdr->b_l1hdr.b_acb); ASSERT3P(hdr->b_l1hdr.b_buf, !=, NULL); if (uncached) arc_hdr_set_flags(hdr, ARC_FLAG_UNCACHED); else if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (ARC_BUF_ENCRYPTED(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); localprop.zp_encrypt = B_TRUE; localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; localprop.zp_byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; memcpy(localprop.zp_salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); memcpy(localprop.zp_iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); memcpy(localprop.zp_mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { localprop.zp_nopwrite = B_FALSE; localprop.zp_copies = MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); localprop.zp_gang_copies = MIN(localprop.zp_gang_copies, SPA_DVAS_PER_BP - 1); } zio_flags |= ZIO_FLAG_RAW; } else if (ARC_BUF_COMPRESSED(buf)) { ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); localprop.zp_compress = HDR_GET_COMPRESS(hdr); localprop.zp_complevel = hdr->b_complevel; zio_flags |= ZIO_FLAG_RAW_COMPRESS; } callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; /* * The hdr's b_pabd is now stale, free it now. A new data block * will be allocated when the zio pipeline calls arc_write_ready(). */ if (hdr->b_l1hdr.b_pabd != NULL) { /* * If the buf is currently sharing the data block with * the hdr then we need to break that relationship here. 
* The hdr will remain with a NULL data pointer and the * buf will take sole ownership of the block. */ if (ARC_BUF_SHARED(buf)) { arc_unshare_buf(hdr, buf); } else { ASSERT(!arc_buf_is_shared(buf)); arc_hdr_free_abd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); } if (HDR_HAS_RABD(hdr)) arc_hdr_free_abd(hdr, B_TRUE); if (!(zio_flags & ZIO_FLAG_RAW)) arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); ASSERT(!arc_buf_is_shared(buf)); ASSERT0P(hdr->b_l1hdr.b_pabd); zio = zio_write(pio, spa, txg, bp, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, arc_write_done, callback, priority, zio_flags, zb); return (zio); } void arc_tempreserve_clear(uint64_t reserve) { atomic_add_64(&arc_tempreserve, -reserve); ASSERT((int64_t)arc_tempreserve >= 0); } int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) { int error; uint64_t anon_size; if (!arc_no_grow && reserve > arc_c/4 && reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT)) arc_c = MIN(arc_c_max, reserve * 4); /* * Throttle when the calculated memory footprint for the TXG * exceeds the target ARC size. */ if (reserve > arc_c) { DMU_TX_STAT_BUMP(dmu_tx_memory_reserve); return (SET_ERROR(ERESTART)); } /* * Don't count loaned bufs as in flight dirty data to prevent long * network delays from blocking transactions that are ready to be * assigned to a txg. */ /* assert that it has not wrapped around */ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); anon_size = MAX((int64_t) (zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_DATA]) + zfs_refcount_count(&arc_anon->arcs_size[ARC_BUFC_METADATA]) - arc_loaned_bytes), 0); /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ error = arc_memory_throttle(spa, reserve, txg); if (error != 0) return (error); /* * Throttle writes when the amount of dirty data in the cache * gets too large. We try to keep the cache less than half full * of dirty blocks so that our sync times don't grow too large. * * In the case of one pool being built on another pool, we want * to make sure we don't end up throttling the lower (backing) * pool when the upper pool is the majority contributor to dirty * data. To insure we make forward progress during throttling, we * also check the current pool's net dirty data and only throttle * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty * data in the cache. * * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ uint64_t total_dirty = reserve + arc_tempreserve + anon_size; uint64_t spa_dirty_anon = spa_dirty_data(spa); uint64_t rarc_c = arc_warm ? 
arc_c : arc_c_max; if (total_dirty > rarc_c * zfs_arc_dirty_limit_percent / 100 && anon_size > rarc_c * zfs_arc_anon_limit_percent / 100 && spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { #ifdef ZFS_DEBUG uint64_t meta_esize = zfs_refcount_count( &arc_anon->arcs_esize[ARC_BUFC_METADATA]); uint64_t data_esize = zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK rarc_c=%lluK\n", (u_longlong_t)arc_tempreserve >> 10, (u_longlong_t)meta_esize >> 10, (u_longlong_t)data_esize >> 10, (u_longlong_t)reserve >> 10, (u_longlong_t)rarc_c >> 10); #endif DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); return (SET_ERROR(ERESTART)); } atomic_add_64(&arc_tempreserve, reserve); return (0); } static void arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, kstat_named_t *data, kstat_named_t *metadata, kstat_named_t *evict_data, kstat_named_t *evict_metadata) { data->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_DATA]); metadata->value.ui64 = zfs_refcount_count(&state->arcs_size[ARC_BUFC_METADATA]); size->value.ui64 = data->value.ui64 + metadata->value.ui64; evict_data->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); evict_metadata->value.ui64 = zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); } static int arc_kstat_update(kstat_t *ksp, int rw) { arc_stats_t *as = ksp->ks_data; if (rw == KSTAT_WRITE) return (SET_ERROR(EACCES)); as->arcstat_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_hits); as->arcstat_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_iohits); as->arcstat_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_misses); as->arcstat_demand_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_hits); as->arcstat_demand_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_iohits); as->arcstat_demand_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_data_misses); as->arcstat_demand_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_hits); as->arcstat_demand_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_iohits); as->arcstat_demand_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_metadata_misses); as->arcstat_prefetch_data_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_hits); as->arcstat_prefetch_data_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_iohits); as->arcstat_prefetch_data_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_data_misses); as->arcstat_prefetch_metadata_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_hits); as->arcstat_prefetch_metadata_iohits.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_iohits); as->arcstat_prefetch_metadata_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_prefetch_metadata_misses); as->arcstat_mru_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_hits); as->arcstat_mru_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mru_ghost_hits); as->arcstat_mfu_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_hits); as->arcstat_mfu_ghost_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_mfu_ghost_hits); as->arcstat_uncached_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_uncached_hits); as->arcstat_deleted.value.ui64 = wmsum_value(&arc_sums.arcstat_deleted); as->arcstat_mutex_miss.value.ui64 = wmsum_value(&arc_sums.arcstat_mutex_miss); as->arcstat_access_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_access_skip); 
as->arcstat_evict_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_skip); as->arcstat_evict_not_enough.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_not_enough); as->arcstat_evict_l2_cached.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_cached); as->arcstat_evict_l2_eligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible); as->arcstat_evict_l2_eligible_mfu.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mfu); as->arcstat_evict_l2_eligible_mru.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_eligible_mru); as->arcstat_evict_l2_ineligible.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_ineligible); as->arcstat_evict_l2_skip.value.ui64 = wmsum_value(&arc_sums.arcstat_evict_l2_skip); as->arcstat_hash_elements.value.ui64 = as->arcstat_hash_elements_max.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_elements); as->arcstat_hash_collisions.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_collisions); as->arcstat_hash_chains.value.ui64 = wmsum_value(&arc_sums.arcstat_hash_chains); as->arcstat_size.value.ui64 = aggsum_value(&arc_sums.arcstat_size); as->arcstat_compressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_compressed_size); as->arcstat_uncompressed_size.value.ui64 = wmsum_value(&arc_sums.arcstat_uncompressed_size); as->arcstat_overhead_size.value.ui64 = wmsum_value(&arc_sums.arcstat_overhead_size); as->arcstat_hdr_size.value.ui64 = wmsum_value(&arc_sums.arcstat_hdr_size); as->arcstat_data_size.value.ui64 = wmsum_value(&arc_sums.arcstat_data_size); as->arcstat_metadata_size.value.ui64 = wmsum_value(&arc_sums.arcstat_metadata_size); as->arcstat_dbuf_size.value.ui64 = wmsum_value(&arc_sums.arcstat_dbuf_size); #if defined(COMPAT_FREEBSD11) as->arcstat_other_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size) + aggsum_value(&arc_sums.arcstat_dnode_size) + wmsum_value(&arc_sums.arcstat_dbuf_size); #endif arc_kstat_update_state(arc_anon, &as->arcstat_anon_size, &as->arcstat_anon_data, &as->arcstat_anon_metadata, &as->arcstat_anon_evictable_data, &as->arcstat_anon_evictable_metadata); arc_kstat_update_state(arc_mru, &as->arcstat_mru_size, &as->arcstat_mru_data, &as->arcstat_mru_metadata, &as->arcstat_mru_evictable_data, &as->arcstat_mru_evictable_metadata); arc_kstat_update_state(arc_mru_ghost, &as->arcstat_mru_ghost_size, &as->arcstat_mru_ghost_data, &as->arcstat_mru_ghost_metadata, &as->arcstat_mru_ghost_evictable_data, &as->arcstat_mru_ghost_evictable_metadata); arc_kstat_update_state(arc_mfu, &as->arcstat_mfu_size, &as->arcstat_mfu_data, &as->arcstat_mfu_metadata, &as->arcstat_mfu_evictable_data, &as->arcstat_mfu_evictable_metadata); arc_kstat_update_state(arc_mfu_ghost, &as->arcstat_mfu_ghost_size, &as->arcstat_mfu_ghost_data, &as->arcstat_mfu_ghost_metadata, &as->arcstat_mfu_ghost_evictable_data, &as->arcstat_mfu_ghost_evictable_metadata); arc_kstat_update_state(arc_uncached, &as->arcstat_uncached_size, &as->arcstat_uncached_data, &as->arcstat_uncached_metadata, &as->arcstat_uncached_evictable_data, &as->arcstat_uncached_evictable_metadata); as->arcstat_dnode_size.value.ui64 = aggsum_value(&arc_sums.arcstat_dnode_size); as->arcstat_bonus_size.value.ui64 = wmsum_value(&arc_sums.arcstat_bonus_size); as->arcstat_l2_hits.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_hits); as->arcstat_l2_misses.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_misses); as->arcstat_l2_prefetch_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_prefetch_asize); as->arcstat_l2_mru_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mru_asize); 
as->arcstat_l2_mfu_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_mfu_asize); as->arcstat_l2_bufc_data_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_data_asize); as->arcstat_l2_bufc_metadata_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_bufc_metadata_asize); as->arcstat_l2_feeds.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_feeds); as->arcstat_l2_rw_clash.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rw_clash); as->arcstat_l2_read_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_read_bytes); as->arcstat_l2_write_bytes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_write_bytes); as->arcstat_l2_writes_sent.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_sent); as->arcstat_l2_writes_done.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_done); as->arcstat_l2_writes_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_error); as->arcstat_l2_writes_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_writes_lock_retry); as->arcstat_l2_evict_lock_retry.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_lock_retry); as->arcstat_l2_evict_reading.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_reading); as->arcstat_l2_evict_l1cached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_evict_l1cached); as->arcstat_l2_free_on_write.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_free_on_write); as->arcstat_l2_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_abort_lowmem); as->arcstat_l2_cksum_bad.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_cksum_bad); as->arcstat_l2_io_error.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_io_error); as->arcstat_l2_lsize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_lsize); as->arcstat_l2_psize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_psize); as->arcstat_l2_hdr_size.value.ui64 = aggsum_value(&arc_sums.arcstat_l2_hdr_size); as->arcstat_l2_log_blk_writes.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_writes); as->arcstat_l2_log_blk_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_asize); as->arcstat_l2_log_blk_count.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_log_blk_count); as->arcstat_l2_rebuild_success.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_success); as->arcstat_l2_rebuild_abort_unsupported.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_unsupported); as->arcstat_l2_rebuild_abort_io_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_io_errors); as->arcstat_l2_rebuild_abort_dh_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); as->arcstat_l2_rebuild_abort_cksum_lb_errors.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); as->arcstat_l2_rebuild_abort_lowmem.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_abort_lowmem); as->arcstat_l2_rebuild_size.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_size); as->arcstat_l2_rebuild_asize.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_asize); as->arcstat_l2_rebuild_bufs.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs); as->arcstat_l2_rebuild_bufs_precached.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_bufs_precached); as->arcstat_l2_rebuild_log_blks.value.ui64 = wmsum_value(&arc_sums.arcstat_l2_rebuild_log_blks); as->arcstat_memory_throttle_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_throttle_count); as->arcstat_memory_direct_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_direct_count); as->arcstat_memory_indirect_count.value.ui64 = wmsum_value(&arc_sums.arcstat_memory_indirect_count); 
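/*
 * Note: the snapshot assembled by arc_kstat_update() is what userland sees
 * as the "arcstats" kstat -- on Linux builds typically via
 * /proc/spl/kstat/zfs/arcstats, on FreeBSD via the kstat.zfs.misc.arcstats
 * sysctl tree.  The paths are mentioned here for orientation only.
 */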
as->arcstat_memory_all_bytes.value.ui64 = arc_all_memory(); as->arcstat_memory_free_bytes.value.ui64 = arc_free_memory(); as->arcstat_memory_available_bytes.value.i64 = arc_available_memory(); as->arcstat_prune.value.ui64 = wmsum_value(&arc_sums.arcstat_prune); as->arcstat_meta_used.value.ui64 = wmsum_value(&arc_sums.arcstat_meta_used); as->arcstat_async_upgrade_sync.value.ui64 = wmsum_value(&arc_sums.arcstat_async_upgrade_sync); as->arcstat_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_predictive_prefetch); as->arcstat_demand_hit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_predictive_prefetch); as->arcstat_demand_iohit_predictive_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_predictive_prefetch); as->arcstat_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_prescient_prefetch); as->arcstat_demand_hit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_hit_prescient_prefetch); as->arcstat_demand_iohit_prescient_prefetch.value.ui64 = wmsum_value(&arc_sums.arcstat_demand_iohit_prescient_prefetch); as->arcstat_raw_size.value.ui64 = wmsum_value(&arc_sums.arcstat_raw_size); as->arcstat_cached_only_in_progress.value.ui64 = wmsum_value(&arc_sums.arcstat_cached_only_in_progress); as->arcstat_abd_chunk_waste_size.value.ui64 = wmsum_value(&arc_sums.arcstat_abd_chunk_waste_size); return (0); } /* * This function *must* return indices evenly distributed between all * sublists of the multilist. This is needed due to how the ARC eviction * code is laid out; arc_evict_state() assumes ARC buffers are evenly * distributed between all sublists and uses this assumption when * deciding which sublist to evict from and how much to evict from it. */ static unsigned int arc_state_multilist_index_func(multilist_t *ml, void *obj) { arc_buf_hdr_t *hdr = obj; /* * We rely on b_dva to generate evenly distributed index * numbers using buf_hash below. So, as an added precaution, * let's make sure we never add empty buffers to the arc lists. */ ASSERT(!HDR_EMPTY(hdr)); /* * The assumption here, is the hash value for a given * arc_buf_hdr_t will remain constant throughout its lifetime * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * * Also, the low order bits of the hash value are thought to be * distributed evenly. Otherwise, in the case that the multilist * has a power of two number of sublists, each sublists' usage * would not be evenly distributed. In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % multilist_get_num_sublists(ml)); } static unsigned int arc_state_l2c_multilist_index_func(multilist_t *ml, void *obj) { panic("Header %p insert into arc_l2c_only %p", obj, ml); } #define WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do { \ if ((do_warn) && (tuning) && ((tuning) != (value))) { \ cmn_err(CE_WARN, \ "ignoring tunable %s (using %llu instead)", \ (#tuning), (u_longlong_t)(value)); \ } \ } while (0) /* * Called during module initialization and periodically thereafter to * apply reasonable changes to the exposed performance tunings. Can also be * called explicitly by param_set_arc_*() functions when ARC tunables are * updated manually. Non-zero zfs_* values which differ from the currently set * values will be applied. 
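 *
 * For example (hypothetical value; Linux module-parameter path shown for
 * illustration only), manually shrinking the ARC ceiling at runtime:
 *
 *	# echo $((8 * 1024 * 1024 * 1024)) > /sys/module/zfs/parameters/zfs_arc_max
 *
 * is routed through the param_set_arc_*() handlers mentioned above, which
 * call arc_tuning_update() so the new zfs_arc_max is validated against
 * arc_c_min and, if acceptable, applied to arc_c_max.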
*/ void arc_tuning_update(boolean_t verbose) { uint64_t allmem = arc_all_memory(); /* Valid range: 32M - */ if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) && (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) && (zfs_arc_min <= arc_c_max)) { arc_c_min = zfs_arc_min; arc_c = MAX(arc_c, arc_c_min); } WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose); /* Valid range: 64M - */ if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) && (zfs_arc_max >= MIN_ARC_MAX) && (zfs_arc_max < allmem) && (zfs_arc_max > arc_c_min)) { arc_c_max = zfs_arc_max; arc_c = MIN(arc_c, arc_c_max); if (arc_dnode_limit > arc_c_max) arc_dnode_limit = arc_c_max; } WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose); /* Valid range: 0 - */ arc_dnode_limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit : MIN(zfs_arc_dnode_limit_percent, 100) * arc_c_max / 100; WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_limit, verbose); /* Valid range: 1 - N */ if (zfs_arc_grow_retry) arc_grow_retry = zfs_arc_grow_retry; /* Valid range: 1 - N */ if (zfs_arc_shrink_shift) { arc_shrink_shift = zfs_arc_shrink_shift; arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift -1); } /* Valid range: 1 - N ms */ if (zfs_arc_min_prefetch_ms) arc_min_prefetch_ms = zfs_arc_min_prefetch_ms; /* Valid range: 1 - N ms */ if (zfs_arc_min_prescient_prefetch_ms) { arc_min_prescient_prefetch_ms = zfs_arc_min_prescient_prefetch_ms; } /* Valid range: 0 - 100 */ if (zfs_arc_lotsfree_percent <= 100) arc_lotsfree_percent = zfs_arc_lotsfree_percent; WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent, verbose); /* Valid range: 0 - */ if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free)) arc_sys_free = MIN(zfs_arc_sys_free, allmem); WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose); } static void arc_state_multilist_init(multilist_t *ml, multilist_sublist_index_func_t *index_func, int *maxcountp) { multilist_create(ml, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), index_func); *maxcountp = MAX(*maxcountp, multilist_get_num_sublists(ml)); } static void arc_state_init(void) { int num_sublists = 0; arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_METADATA], arc_state_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_uncached->arcs_list[ARC_BUFC_DATA], arc_state_multilist_index_func, &num_sublists); /* * L2 headers should never be on the L2 state list since they don't * have L1 headers allocated. Special index function asserts that. 
*/ arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], arc_state_l2c_multilist_index_func, &num_sublists); arc_state_multilist_init(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], arc_state_l2c_multilist_index_func, &num_sublists); /* * Keep track of the number of markers needed to reclaim buffers from * any ARC state. The markers will be pre-allocated so as to minimize * the number of memory allocations performed by the eviction thread. */ arc_state_evict_marker_count = num_sublists; zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_create(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA], 0); wmsum_init(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA], 0); wmsum_init(&arc_sums.arcstat_hits, 0); wmsum_init(&arc_sums.arcstat_iohits, 0); wmsum_init(&arc_sums.arcstat_misses, 0); wmsum_init(&arc_sums.arcstat_demand_data_hits, 0); wmsum_init(&arc_sums.arcstat_demand_data_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_data_misses, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_demand_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_data_misses, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_hits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_iohits, 0); wmsum_init(&arc_sums.arcstat_prefetch_metadata_misses, 0); wmsum_init(&arc_sums.arcstat_mru_hits, 0); wmsum_init(&arc_sums.arcstat_mru_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_hits, 0); wmsum_init(&arc_sums.arcstat_mfu_ghost_hits, 0); wmsum_init(&arc_sums.arcstat_uncached_hits, 0); wmsum_init(&arc_sums.arcstat_deleted, 0); 
wmsum_init(&arc_sums.arcstat_mutex_miss, 0); wmsum_init(&arc_sums.arcstat_access_skip, 0); wmsum_init(&arc_sums.arcstat_evict_skip, 0); wmsum_init(&arc_sums.arcstat_evict_not_enough, 0); wmsum_init(&arc_sums.arcstat_evict_l2_cached, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mfu, 0); wmsum_init(&arc_sums.arcstat_evict_l2_eligible_mru, 0); wmsum_init(&arc_sums.arcstat_evict_l2_ineligible, 0); wmsum_init(&arc_sums.arcstat_evict_l2_skip, 0); wmsum_init(&arc_sums.arcstat_hash_elements, 0); wmsum_init(&arc_sums.arcstat_hash_collisions, 0); wmsum_init(&arc_sums.arcstat_hash_chains, 0); aggsum_init(&arc_sums.arcstat_size, 0); wmsum_init(&arc_sums.arcstat_compressed_size, 0); wmsum_init(&arc_sums.arcstat_uncompressed_size, 0); wmsum_init(&arc_sums.arcstat_overhead_size, 0); wmsum_init(&arc_sums.arcstat_hdr_size, 0); wmsum_init(&arc_sums.arcstat_data_size, 0); wmsum_init(&arc_sums.arcstat_metadata_size, 0); wmsum_init(&arc_sums.arcstat_dbuf_size, 0); aggsum_init(&arc_sums.arcstat_dnode_size, 0); wmsum_init(&arc_sums.arcstat_bonus_size, 0); wmsum_init(&arc_sums.arcstat_l2_hits, 0); wmsum_init(&arc_sums.arcstat_l2_misses, 0); wmsum_init(&arc_sums.arcstat_l2_prefetch_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mru_asize, 0); wmsum_init(&arc_sums.arcstat_l2_mfu_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_data_asize, 0); wmsum_init(&arc_sums.arcstat_l2_bufc_metadata_asize, 0); wmsum_init(&arc_sums.arcstat_l2_feeds, 0); wmsum_init(&arc_sums.arcstat_l2_rw_clash, 0); wmsum_init(&arc_sums.arcstat_l2_read_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_write_bytes, 0); wmsum_init(&arc_sums.arcstat_l2_writes_sent, 0); wmsum_init(&arc_sums.arcstat_l2_writes_done, 0); wmsum_init(&arc_sums.arcstat_l2_writes_error, 0); wmsum_init(&arc_sums.arcstat_l2_writes_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_lock_retry, 0); wmsum_init(&arc_sums.arcstat_l2_evict_reading, 0); wmsum_init(&arc_sums.arcstat_l2_evict_l1cached, 0); wmsum_init(&arc_sums.arcstat_l2_free_on_write, 0); wmsum_init(&arc_sums.arcstat_l2_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_cksum_bad, 0); wmsum_init(&arc_sums.arcstat_l2_io_error, 0); wmsum_init(&arc_sums.arcstat_l2_lsize, 0); wmsum_init(&arc_sums.arcstat_l2_psize, 0); aggsum_init(&arc_sums.arcstat_l2_hdr_size, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_writes, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_asize, 0); wmsum_init(&arc_sums.arcstat_l2_log_blk_count, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_success, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_unsupported, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_io_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_dh_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_abort_lowmem, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_size, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_asize, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_bufs_precached, 0); wmsum_init(&arc_sums.arcstat_l2_rebuild_log_blks, 0); wmsum_init(&arc_sums.arcstat_memory_throttle_count, 0); wmsum_init(&arc_sums.arcstat_memory_direct_count, 0); wmsum_init(&arc_sums.arcstat_memory_indirect_count, 0); wmsum_init(&arc_sums.arcstat_prune, 0); wmsum_init(&arc_sums.arcstat_meta_used, 0); wmsum_init(&arc_sums.arcstat_async_upgrade_sync, 0); wmsum_init(&arc_sums.arcstat_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_predictive_prefetch, 0); 
wmsum_init(&arc_sums.arcstat_demand_iohit_predictive_prefetch, 0); wmsum_init(&arc_sums.arcstat_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_hit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_demand_iohit_prescient_prefetch, 0); wmsum_init(&arc_sums.arcstat_raw_size, 0); wmsum_init(&arc_sums.arcstat_cached_only_in_progress, 0); wmsum_init(&arc_sums.arcstat_abd_chunk_waste_size, 0); arc_anon->arcs_state = ARC_STATE_ANON; arc_mru->arcs_state = ARC_STATE_MRU; arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; arc_mfu->arcs_state = ARC_STATE_MFU; arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; arc_uncached->arcs_state = ARC_STATE_UNCACHED; } static void arc_state_fini(void) { zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_esize[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_anon->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mru_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_mfu_ghost->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_l2c_only->arcs_size[ARC_BUFC_METADATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_DATA]); zfs_refcount_destroy(&arc_uncached->arcs_size[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_METADATA]); multilist_destroy(&arc_uncached->arcs_list[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mru_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_DATA]); wmsum_fini(&arc_mfu_ghost->arcs_hits[ARC_BUFC_METADATA]); wmsum_fini(&arc_sums.arcstat_hits); 
wmsum_fini(&arc_sums.arcstat_iohits); wmsum_fini(&arc_sums.arcstat_misses); wmsum_fini(&arc_sums.arcstat_demand_data_hits); wmsum_fini(&arc_sums.arcstat_demand_data_iohits); wmsum_fini(&arc_sums.arcstat_demand_data_misses); wmsum_fini(&arc_sums.arcstat_demand_metadata_hits); wmsum_fini(&arc_sums.arcstat_demand_metadata_iohits); wmsum_fini(&arc_sums.arcstat_demand_metadata_misses); wmsum_fini(&arc_sums.arcstat_prefetch_data_hits); wmsum_fini(&arc_sums.arcstat_prefetch_data_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_data_misses); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_hits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_iohits); wmsum_fini(&arc_sums.arcstat_prefetch_metadata_misses); wmsum_fini(&arc_sums.arcstat_mru_hits); wmsum_fini(&arc_sums.arcstat_mru_ghost_hits); wmsum_fini(&arc_sums.arcstat_mfu_hits); wmsum_fini(&arc_sums.arcstat_mfu_ghost_hits); wmsum_fini(&arc_sums.arcstat_uncached_hits); wmsum_fini(&arc_sums.arcstat_deleted); wmsum_fini(&arc_sums.arcstat_mutex_miss); wmsum_fini(&arc_sums.arcstat_access_skip); wmsum_fini(&arc_sums.arcstat_evict_skip); wmsum_fini(&arc_sums.arcstat_evict_not_enough); wmsum_fini(&arc_sums.arcstat_evict_l2_cached); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mfu); wmsum_fini(&arc_sums.arcstat_evict_l2_eligible_mru); wmsum_fini(&arc_sums.arcstat_evict_l2_ineligible); wmsum_fini(&arc_sums.arcstat_evict_l2_skip); wmsum_fini(&arc_sums.arcstat_hash_elements); wmsum_fini(&arc_sums.arcstat_hash_collisions); wmsum_fini(&arc_sums.arcstat_hash_chains); aggsum_fini(&arc_sums.arcstat_size); wmsum_fini(&arc_sums.arcstat_compressed_size); wmsum_fini(&arc_sums.arcstat_uncompressed_size); wmsum_fini(&arc_sums.arcstat_overhead_size); wmsum_fini(&arc_sums.arcstat_hdr_size); wmsum_fini(&arc_sums.arcstat_data_size); wmsum_fini(&arc_sums.arcstat_metadata_size); wmsum_fini(&arc_sums.arcstat_dbuf_size); aggsum_fini(&arc_sums.arcstat_dnode_size); wmsum_fini(&arc_sums.arcstat_bonus_size); wmsum_fini(&arc_sums.arcstat_l2_hits); wmsum_fini(&arc_sums.arcstat_l2_misses); wmsum_fini(&arc_sums.arcstat_l2_prefetch_asize); wmsum_fini(&arc_sums.arcstat_l2_mru_asize); wmsum_fini(&arc_sums.arcstat_l2_mfu_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_data_asize); wmsum_fini(&arc_sums.arcstat_l2_bufc_metadata_asize); wmsum_fini(&arc_sums.arcstat_l2_feeds); wmsum_fini(&arc_sums.arcstat_l2_rw_clash); wmsum_fini(&arc_sums.arcstat_l2_read_bytes); wmsum_fini(&arc_sums.arcstat_l2_write_bytes); wmsum_fini(&arc_sums.arcstat_l2_writes_sent); wmsum_fini(&arc_sums.arcstat_l2_writes_done); wmsum_fini(&arc_sums.arcstat_l2_writes_error); wmsum_fini(&arc_sums.arcstat_l2_writes_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_lock_retry); wmsum_fini(&arc_sums.arcstat_l2_evict_reading); wmsum_fini(&arc_sums.arcstat_l2_evict_l1cached); wmsum_fini(&arc_sums.arcstat_l2_free_on_write); wmsum_fini(&arc_sums.arcstat_l2_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_cksum_bad); wmsum_fini(&arc_sums.arcstat_l2_io_error); wmsum_fini(&arc_sums.arcstat_l2_lsize); wmsum_fini(&arc_sums.arcstat_l2_psize); aggsum_fini(&arc_sums.arcstat_l2_hdr_size); wmsum_fini(&arc_sums.arcstat_l2_log_blk_writes); wmsum_fini(&arc_sums.arcstat_l2_log_blk_asize); wmsum_fini(&arc_sums.arcstat_l2_log_blk_count); wmsum_fini(&arc_sums.arcstat_l2_rebuild_success); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_unsupported); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_io_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_dh_errors); 
wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_cksum_lb_errors); wmsum_fini(&arc_sums.arcstat_l2_rebuild_abort_lowmem); wmsum_fini(&arc_sums.arcstat_l2_rebuild_size); wmsum_fini(&arc_sums.arcstat_l2_rebuild_asize); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs); wmsum_fini(&arc_sums.arcstat_l2_rebuild_bufs_precached); wmsum_fini(&arc_sums.arcstat_l2_rebuild_log_blks); wmsum_fini(&arc_sums.arcstat_memory_throttle_count); wmsum_fini(&arc_sums.arcstat_memory_direct_count); wmsum_fini(&arc_sums.arcstat_memory_indirect_count); wmsum_fini(&arc_sums.arcstat_prune); wmsum_fini(&arc_sums.arcstat_meta_used); wmsum_fini(&arc_sums.arcstat_async_upgrade_sync); wmsum_fini(&arc_sums.arcstat_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_predictive_prefetch); wmsum_fini(&arc_sums.arcstat_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_hit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_demand_iohit_prescient_prefetch); wmsum_fini(&arc_sums.arcstat_raw_size); wmsum_fini(&arc_sums.arcstat_cached_only_in_progress); wmsum_fini(&arc_sums.arcstat_abd_chunk_waste_size); } uint64_t arc_target_bytes(void) { return (arc_c); } void arc_set_limits(uint64_t allmem) { /* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */ arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT); /* How to set default max varies by platform. */ arc_c_max = arc_default_max(arc_c_min, allmem); } void arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; #if defined(_KERNEL) arc_lowmem_init(); #endif arc_set_limits(allmem); #ifdef _KERNEL /* * If zfs_arc_max is non-zero at init, meaning it was set in the kernel * environment before the module was loaded, don't block setting the * maximum because it is less than arc_c_min, instead, reset arc_c_min * to a lower value. * zfs_arc_min will be handled by arc_tuning_update(). */ if (zfs_arc_max != 0 && zfs_arc_max >= MIN_ARC_MAX && zfs_arc_max < allmem) { arc_c_max = zfs_arc_max; if (arc_c_min >= arc_c_max) { arc_c_min = MAX(zfs_arc_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); } } #else /* * In userland, there's only the memory pressure that we artificially * create (see arc_available_memory()). Don't let arc_c get too * small, because it can cause transactions to be larger than * arc_c, causing arc_tempreserve_space() to fail. */ arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT); #endif arc_c = arc_c_min; /* * 32-bit fixed point fractions of metadata from total ARC size, * MRU data from all data and MRU metadata from all metadata. */ arc_meta = (1ULL << 32) / 4; /* Metadata is 25% of arc_c. */ arc_pd = (1ULL << 32) / 2; /* Data MRU is 50% of data. */ arc_pm = (1ULL << 32) / 2; /* Metadata MRU is 50% of metadata. 
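 * (Illustrative arithmetic with a hypothetical size: these 32-bit fixed-point
 * fractions scale linearly, so with arc_c at 8 GiB the initial arc_meta
 * fraction of (1ULL << 32) / 4 corresponds to a metadata target of roughly
 * 2 GiB, and arc_pd / arc_pm then split the data and metadata targets evenly
 * between their MRU and MFU portions.)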
*/ percent = MIN(zfs_arc_dnode_limit_percent, 100); arc_dnode_limit = arc_c_max * percent / 100; /* Apply user specified tunings */ arc_tuning_update(B_TRUE); /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; if (arc_c < arc_c_min) arc_c = arc_c_min; arc_register_hotplug(); arc_state_init(); buf_init(); list_create(&arc_prune_list, sizeof (arc_prune_t), offsetof(arc_prune_t, p_node)); mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL); arc_prune_taskq = taskq_create("arc_prune", zfs_arc_prune_task_threads, defclsyspri, 100, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); arc_evict_thread_init(); list_create(&arc_async_flush_list, sizeof (arc_async_flush_t), offsetof(arc_async_flush_t, af_node)); mutex_init(&arc_async_flush_lock, NULL, MUTEX_DEFAULT, NULL); arc_flush_taskq = taskq_create("arc_flush", MIN(boot_ncpus, 4), defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (arc_ksp != NULL) { arc_ksp->ks_data = &arc_stats; arc_ksp->ks_update = arc_kstat_update; kstat_install(arc_ksp); } arc_state_evict_markers = arc_state_alloc_markers(arc_state_evict_marker_count); arc_evict_zthr = zthr_create_timer("arc_evict", arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1), defclsyspri); arc_reap_zthr = zthr_create_timer("arc_reap", arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1), minclsyspri); arc_warm = B_FALSE; /* * Calculate maximum amount of dirty data per pool. * * If it has been set by a module parameter, take that. * Otherwise, use a percentage of physical memory defined by * zfs_dirty_data_max_percent (default 10%) with a cap at * zfs_dirty_data_max_max (default 4G or 25% of physical memory). */ #ifdef __LP64__ if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #else if (zfs_dirty_data_max_max == 0) zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024, allmem * zfs_dirty_data_max_max_percent / 100); #endif if (zfs_dirty_data_max == 0) { zfs_dirty_data_max = allmem * zfs_dirty_data_max_percent / 100; zfs_dirty_data_max = MIN(zfs_dirty_data_max, zfs_dirty_data_max_max); } if (zfs_wrlog_data_max == 0) { /* * dp_wrlog_total is reduced for each txg at the end of * spa_sync(). However, dp_dirty_total is reduced every time * a block is written out. Thus under normal operation, * dp_wrlog_total could grow 2 times as big as * zfs_dirty_data_max. 
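 *
 * Worked example with hypothetical hardware: on a 64 GiB, 64-bit machine
 * with the default percentages above, zfs_dirty_data_max_max is
 * MIN(4 GiB, 16 GiB) = 4 GiB, zfs_dirty_data_max is MIN(6.4 GiB, 4 GiB) =
 * 4 GiB, and zfs_wrlog_data_max therefore defaults to 8 GiB.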
*/ zfs_wrlog_data_max = zfs_dirty_data_max * 2; } } void arc_fini(void) { arc_prune_t *p; #ifdef _KERNEL arc_lowmem_fini(); #endif /* _KERNEL */ /* Wait for any background flushes */ taskq_wait(arc_flush_taskq); taskq_destroy(arc_flush_taskq); /* Use B_TRUE to ensure *all* buffers are evicted */ arc_flush(NULL, B_TRUE); if (arc_ksp != NULL) { kstat_delete(arc_ksp); arc_ksp = NULL; } taskq_wait(arc_prune_taskq); taskq_destroy(arc_prune_taskq); list_destroy(&arc_async_flush_list); mutex_destroy(&arc_async_flush_lock); mutex_enter(&arc_prune_mtx); while ((p = list_remove_head(&arc_prune_list)) != NULL) { (void) zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); } mutex_exit(&arc_prune_mtx); list_destroy(&arc_prune_list); mutex_destroy(&arc_prune_mtx); if (arc_evict_taskq != NULL) taskq_wait(arc_evict_taskq); (void) zthr_cancel(arc_evict_zthr); (void) zthr_cancel(arc_reap_zthr); arc_state_free_markers(arc_state_evict_markers, arc_state_evict_marker_count); if (arc_evict_taskq != NULL) { taskq_destroy(arc_evict_taskq); kmem_free(arc_evict_arg, sizeof (evict_arg_t) * zfs_arc_evict_threads); } mutex_destroy(&arc_evict_lock); list_destroy(&arc_evict_waiters); /* * Free any buffers that were tagged for destruction. This needs * to occur before arc_state_fini() runs and destroys the aggsum * values which are updated when freeing scatter ABDs. */ l2arc_do_free_on_write(); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may * trigger the release of kmem magazines, which can callback to * arc_space_return() which accesses aggsums freed in act_state_fini(). */ buf_fini(); arc_state_fini(); arc_unregister_hotplug(); /* * We destroy the zthrs after all the ARC state has been * torn down to avoid the case of them receiving any * wakeup() signals after they are destroyed. */ zthr_destroy(arc_evict_zthr); zthr_destroy(arc_reap_zthr); ASSERT0(arc_loaned_bytes); } /* * Level 2 ARC * * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. * It uses dedicated storage devices to hold cached data, which are populated * using large infrequent writes. The main role of this cache is to boost * the performance of random read workloads. The intended L2ARC devices * include short-stroked disks, solid state disks, and other media with * substantially faster read latency than disk. * * +-----------------------+ * | ARC | * +-----------------------+ * | ^ ^ * | | | * l2arc_feed_thread() arc_read() * | | | * | l2arc read | * V | | * +---------------+ | * | L2ARC | | * +---------------+ | * | ^ | * l2arc_write() | | * | | | * V | | * +-------+ +-------+ * | vdev | | vdev | * | cache | | cache | * +-------+ +-------+ * +=========+ .-----. * : L2ARC : |-_____-| * : devices : | Disks | * +=========+ `-_____-' * * Read requests are satisfied from the following sources, in order: * * 1) ARC * 2) vdev cache of L2ARC devices * 3) L2ARC devices * 4) vdev cache of disks * 5) disks * * Some L2ARC device types exhibit extremely slow write performance. * To accommodate for this there are some significant differences between * the L2ARC and traditional cache design: * * 1. There is no eviction path from the ARC to the L2ARC. Evictions from * the ARC behave as usual, freeing buffers and placing headers on ghost * lists. The ARC does not send buffers to the L2ARC during eviction as * this would add inflated write latencies for all ARC memory pressure. * * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 
* It does this by periodically scanning buffers from the eviction-end of * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are * not already there. It scans until a headroom of buffers is satisfied, * which itself is a buffer for ARC eviction. If a compressible buffer is * found during scanning and selected for writing to an L2ARC device, we * temporarily boost scanning headroom during the next scan cycle to make * sure we adapt to compression effects (which might significantly reduce * the data volume we write to L2ARC). The thread that does this is * l2arc_feed_thread(), illustrated below; example sizes are included to * provide a better sense of ratio than this diagram: * * head --> tail * +---------------------+----------+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC * +---------------------+----------+ | o L2ARC eligible * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer * +---------------------+----------+ | * 15.9 Gbytes ^ 32 Mbytes | * headroom | * l2arc_feed_thread() * | * l2arc write hand <--[oooo]--' * | 8 Mbyte * | write max * V * +==============================+ * L2ARC dev |####|#|###|###| |####| ... | * +==============================+ * 32 Gbytes * * 3. If an ARC buffer is copied to the L2ARC but then hit instead of * evicted, then the L2ARC has cached a buffer much sooner than it probably * needed to, potentially wasting L2ARC device bandwidth and storage. It is * safe to say that this is an uncommon case, since buffers at the end of * the ARC lists have moved there due to inactivity. * * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, * then the L2ARC simply misses copying some buffers. This serves as a * pressure valve to prevent heavy read workloads from both stalling the ARC * with waits and clogging the L2ARC with writes. This also helps prevent * the potential for the L2ARC to churn if it attempts to cache content too * quickly, such as during backups of the entire pool. * * 5. After system boot and before the ARC has filled main memory, there are * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru * lists can remain mostly static. Instead of searching from tail of these * lists as pictured, the l2arc_feed_thread() will search from the list heads * for eligible buffers, greatly increasing its chance of finding them. * * The L2ARC device write speed is also boosted during this time so that * the L2ARC warms up faster. Since there have been no ARC evictions yet, * there are no L2ARC reads, and no fear of degrading read performance * through increased writes. * * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that * the vdev queue can aggregate them into larger and fewer writes. Each * device is written to in a rotor fashion, sweeping writes through * available space then repeating. * * 7. The L2ARC does not store dirty content. It never needs to flush * write buffers back to disk based storage. * * 8. If an ARC buffer is written (and dirtied) which also exists in the * L2ARC, the now stale L2ARC buffer is immediately dropped. 
* * The performance of the L2ARC can be tweaked by a number of tunables, which * may be necessary for different workloads: * * l2arc_write_max max write bytes per interval * l2arc_write_boost extra write bytes during device warmup * l2arc_noprefetch skip caching prefetched buffers * l2arc_headroom number of max device writes to precache * l2arc_headroom_boost when we find compressed buffers during ARC * scanning, we multiply headroom by this * percentage factor for the next scan cycle, * since more compressed buffers are likely to * be present * l2arc_feed_secs seconds between L2ARC writing * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. * * There are three key functions that control how the L2ARC warms up: * * l2arc_write_eligible() check if a buffer is eligible to cache * l2arc_write_size() calculate how much to write * l2arc_write_interval() calculate sleep delay between writes * * These three functions determine what to write, how much, and how quickly * to send writes. * * L2ARC persistence: * * When writing buffers to L2ARC, we periodically add some metadata to * make sure we can pick them up after reboot, thus dramatically reducing * the impact that any downtime has on the performance of storage systems * with large caches. * * The implementation works fairly simply by integrating the following two * modifications: * * *) When writing to the L2ARC, we occasionally write a "l2arc log block", * which is an additional piece of metadata which describes what's been * written. This allows us to rebuild the arc_buf_hdr_t structures of the * main ARC buffers. There are 2 linked-lists of log blocks headed by * dh_start_lbps[2]. We alternate which chain we append to, so they are * time-wise and offset-wise interleaved, but that is an optimization rather * than for correctness. The log block also includes a pointer to the * previous block in its chain. * * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device * for our header bookkeeping purposes. This contains a device header, * which contains our top-level reference structures. We update it each * time we write a new log block, so that we're able to locate it in the * L2ARC device. If this write results in an inconsistent device header * (e.g. due to power failure), we detect this by verifying the header's * checksum and simply fail to reconstruct the L2ARC after reboot. * * Implementation diagram: * * +=== L2ARC device (not to scale) ======================================+ * | ___two newest log block pointers__.__________ | * | / \dh_start_lbps[1] | * | / \ \dh_start_lbps[0]| * |.___/__. V V | * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| * || hdr| ^ /^ /^ / / | * |+------+ ...--\-------/ \-----/--\------/ / | * | \--------------/ \--------------/ | * +======================================================================+ * * As can be seen on the diagram, rather than using a simple linked list, * we use a pair of linked lists with alternating elements. This is a * performance enhancement due to the fact that we only find out the * address of the next log block access once the current block has been * completely read in. Obviously, this hurts performance, because we'd be * keeping the device's I/O queue at only a 1 operation deep, thus * incurring a large amount of I/O round-trip latency. Having two lists * allows us to fetch two log blocks ahead of where we are currently * rebuilding L2ARC buffers. 
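 *
 * Concretely (illustration only): log blocks are appended to the two chains
 * alternately, so one chain links blocks 0, 2, 4, ... and the other links
 * blocks 1, 3, 5, ...  While block N is still being read, the address of
 * block N+1 is already known from the other chain, so the rebuild can keep
 * fetching ahead instead of serializing on each block's "previous" pointer.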
* * On-device data structures: * * L2ARC device header: l2arc_dev_hdr_phys_t * L2ARC log block: l2arc_log_blk_phys_t * * L2ARC reconstruction: * * When writing data, we simply write in the standard rotary fashion, * evicting buffers as we go and simply writing new data over them (writing * a new log block every now and then). This obviously means that once we * loop around the end of the device, we will start cutting into an already * committed log block (and its referenced data buffers), like so: * * current write head__ __old tail * \ / * V V * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> * ^ ^^^^^^^^^___________________________________ * | \ * <> may overwrite this blk and/or its bufs --' * * When importing the pool, we detect this situation and use it to stop * our scanning process (see l2arc_rebuild). * * There is one significant caveat to consider when rebuilding ARC contents * from an L2ARC device: what about invalidated buffers? Given the above * construction, we cannot update blocks which we've already written to amend * them to remove buffers which were invalidated. Thus, during reconstruction, * we might be populating the cache with buffers for data that's not on the * main pool anymore, or may have been overwritten! * * As it turns out, this isn't a problem. Every arc_read request includes * both the DVA and, crucially, the birth TXG of the BP the caller is * looking for. So even if the cache were populated by completely rotten * blocks for data that had been long deleted and/or overwritten, we'll * never actually return bad data from the cache, since the DVA with the * birth TXG uniquely identify a block in space and time - once created, * a block is immutable on disk. The worst thing we have done is wasted * some time and memory at l2arc rebuild to reconstruct outdated ARC * entries that will get dropped from the l2arc as it is being updated * with new blocks. * * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write * hand are not restored. This is done by saving the offset (in bytes) * l2arc_evict() has evicted to in the L2ARC device header and taking it * into account when restoring buffers. */ static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) { /* * A buffer is *not* eligible for the L2ARC if it: * 1. belongs to a different spa. * 2. is already cached on the L2ARC. * 3. has an I/O in progress (it may be an incomplete read). * 4. is flagged not eligible (zfs property). */ if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) return (B_FALSE); return (B_TRUE); } static uint64_t l2arc_write_size(l2arc_dev_t *dev) { uint64_t size; /* * Make sure our globals have meaningful values in case the user * altered them. */ size = l2arc_write_max; if (size == 0) { cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " "resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } if (arc_warm == B_FALSE) size += l2arc_write_boost; /* We need to add in the worst case scenario of log block overhead. */ size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ size += MAX(64 * 1024 * 1024, (size * l2arc_trim_ahead) / 100); } /* * Make sure the write size does not exceed the size of the cache * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. 
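 * For example (hypothetical device size), with a 1 GiB cache device the
 * computed write size is clamped below to roughly a quarter of the device's
 * usable space, about 256 MiB per feed cycle, no matter how large
 * l2arc_write_max, l2arc_write_boost and the log block overhead would
 * otherwise make it.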
*/ size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4); size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift); return (size); } static clock_t l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) { clock_t interval, next, now; /* * If the ARC lists are busy, increase our write rate; if the * lists are stale, idle back. This is achieved by checking * how much we previously wrote - if it was more than half of * what we wanted, schedule the next write much sooner. */ if (l2arc_feed_again && wrote > (wanted / 2)) interval = (hz * l2arc_feed_min_ms) / 1000; else interval = hz * l2arc_feed_secs; now = ddi_get_lbolt(); next = MAX(now, MIN(now + interval, began + interval)); return (next); } static boolean_t l2arc_dev_invalid(const l2arc_dev_t *dev) { /* * We want to skip devices that are being rebuilt, trimmed, * removed, or belong to a spa that is being exported. */ return (dev->l2ad_vdev == NULL || vdev_is_dead(dev->l2ad_vdev) || dev->l2ad_rebuild || dev->l2ad_trim_all || dev->l2ad_spa == NULL || dev->l2ad_spa->spa_is_exporting); } /* * Cycle through L2ARC devices. This is how L2ARC load balances. * If a device is returned, this also returns holding the spa config lock. */ static l2arc_dev_t * l2arc_dev_get_next(void) { l2arc_dev_t *first, *next = NULL; /* * Lock out the removal of spas (spa_namespace_lock), then removal * of cache devices (l2arc_dev_mtx). Once a device has been selected, * both locks will be dropped and a spa config lock held instead. */ mutex_enter(&spa_namespace_lock); mutex_enter(&l2arc_dev_mtx); /* if there are no vdevs, there is nothing to do */ if (l2arc_ndev == 0) goto out; first = NULL; next = l2arc_dev_last; do { /* loop around the list looking for a non-faulted vdev */ if (next == NULL) { next = list_head(l2arc_dev_list); } else { next = list_next(l2arc_dev_list, next); if (next == NULL) next = list_head(l2arc_dev_list); } /* if we have come back to the start, bail out */ if (first == NULL) first = next; else if (next == first) break; ASSERT3P(next, !=, NULL); } while (l2arc_dev_invalid(next)); /* if we were unable to find any usable vdevs, return NULL */ if (l2arc_dev_invalid(next)) next = NULL; l2arc_dev_last = next; out: mutex_exit(&l2arc_dev_mtx); /* * Grab the config lock to prevent the 'next' device from being * removed while we are writing to it. */ if (next != NULL) spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); mutex_exit(&spa_namespace_lock); return (next); } /* * Free buffers that were tagged for destruction. */ static void l2arc_do_free_on_write(void) { l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); kmem_free(df, sizeof (l2arc_data_free_t)); } mutex_exit(&l2arc_free_on_write_mtx); } /* * A write to a cache device has completed. Update all headers to allow * reads from these buffers to begin. 
*/ static void l2arc_write_done(zio_t *zio) { l2arc_write_callback_t *cb; l2arc_lb_abd_buf_t *abd_buf; l2arc_lb_ptr_buf_t *lb_ptr_buf; l2arc_dev_t *dev; l2arc_dev_hdr_phys_t *l2dhdr; list_t *buflist; arc_buf_hdr_t *head, *hdr, *hdr_prev; kmutex_t *hash_lock; int64_t bytes_dropped = 0; cb = zio->io_private; ASSERT3P(cb, !=, NULL); dev = cb->l2wcb_dev; l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev, !=, NULL); head = cb->l2wcb_head; ASSERT3P(head, !=, NULL); buflist = &dev->l2ad_buflist; ASSERT3P(buflist, !=, NULL); DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, l2arc_write_callback_t *, cb); /* * All writes completed, or an error was hit. */ top: mutex_enter(&dev->l2ad_mtx); for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. We must retry so we * don't leave the ARC_FLAG_L2_WRITING bit set. */ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); /* * We don't want to rescan the headers we've * already marked as having been written out, so * we reinsert the head node so we can pick up * where we left off. */ list_remove(buflist, head); list_insert_after(buflist, hdr, head); mutex_exit(&dev->l2ad_mtx); /* * We wait for the hash lock to become available * to try and prevent busy waiting, and increase * the chance we'll be able to acquire the lock * the next time around. */ mutex_enter(hash_lock); mutex_exit(hash_lock); goto top; } /* * We could not have been moved into the arc_l2c_only * state while in-flight due to our ARC_FLAG_L2_WRITING * bit being set. Let's just ensure that's being enforced. */ ASSERT(HDR_HAS_L1HDR(hdr)); /* * Skipped - drop L2ARC entry and mark the header as no * longer L2 eligibile. */ if (zio->io_error != 0) { /* * Error - drop L2ARC entry. */ list_remove(buflist, hdr); arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); uint64_t psize = HDR_GET_PSIZE(hdr); l2arc_hdr_arcstats_decrement(hdr); ASSERT(dev->l2ad_vdev != NULL); bytes_dropped += vdev_psize_to_asize(dev->l2ad_vdev, psize); (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); } /* * Allow ARC to begin reads and ghost list evictions to * this L2ARC entry. */ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); mutex_exit(hash_lock); } /* * Free the allocated abd buffers for writing the log blocks. * If the zio failed reclaim the allocated space and remove the * pointers to these log blocks from the log block pointer list * of the L2ARC device. */ while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) { abd_free(abd_buf->abd); zio_buf_free(abd_buf, sizeof (*abd_buf)); if (zio->io_error != 0) { lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list); /* * L2BLK_GET_PSIZE returns aligned size for log * blocks. */ uint64_t asize = L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop); bytes_dropped += asize; ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); (void) zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } list_destroy(&cb->l2wcb_abd_list); if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_writes_error); /* * Restore the lbps array in the header to its previous state. 
* If the list of log block pointers is empty, zero out the * log block pointers in the device header. */ lb_ptr_buf = list_head(&dev->l2ad_lbptr_list); for (int i = 0; i < 2; i++) { if (lb_ptr_buf == NULL) { /* * If the list is empty zero out the device * header. Otherwise zero out the second log * block pointer in the header. */ if (i == 0) { memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); } else { memset(&l2dhdr->dh_start_lbps[i], 0, sizeof (l2arc_log_blkptr_t)); } break; } memcpy(&l2dhdr->dh_start_lbps[i], lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); lb_ptr_buf = list_next(&dev->l2ad_lbptr_list, lb_ptr_buf); } } ARCSTAT_BUMP(arcstat_l2_writes_done); list_remove(buflist, head); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); mutex_exit(&dev->l2ad_mtx); ASSERT(dev->l2ad_vdev != NULL); vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); l2arc_do_free_on_write(); kmem_free(cb, sizeof (l2arc_write_callback_t)); } static int l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) { int ret; spa_t *spa = zio->io_spa; arc_buf_hdr_t *hdr = cb->l2rcb_hdr; blkptr_t *bp = zio->io_bp; uint8_t salt[ZIO_DATA_SALT_LEN]; uint8_t iv[ZIO_DATA_IV_LEN]; uint8_t mac[ZIO_DATA_MAC_LEN]; boolean_t no_crypt = B_FALSE; /* * ZIL data is never be written to the L2ARC, so we don't need * special handling for its unique MAC storage. */ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); /* * If the data was encrypted, decrypt it now. Note that * we must check the bp here and not the hdr, since the * hdr does not have its encryption parameters updated * until arc_read_done(). */ if (BP_IS_ENCRYPTED(bp)) { abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); zio_crypt_decode_params_bp(bp, salt, iv); zio_crypt_decode_mac_bp(bp, mac); ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, hdr->b_l1hdr.b_pabd, &no_crypt); if (ret != 0) { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); goto error; } /* * If we actually performed decryption, replace b_pabd * with the decrypted data. Otherwise we can just throw * our decryption buffer away. */ if (!no_crypt) { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = eabd; zio->io_abd = eabd; } else { arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); } } /* * If the L2ARC block was compressed, but ARC compression * is disabled we decompress the data into a new buffer and * replace the existing data. */ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, ARC_HDR_USE_RESERVE); ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, cabd, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), &hdr->b_complevel); if (ret != 0) { arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); goto error; } arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, arc_hdr_size(hdr), hdr); hdr->b_l1hdr.b_pabd = cabd; zio->io_abd = cabd; zio->io_size = HDR_GET_LSIZE(hdr); } return (0); error: return (ret); } /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. 
*/ static void l2arc_read_done(zio_t *zio) { int tfm_error = 0; l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); hash_lock = HDR_LOCK(hdr); mutex_enter(hash_lock); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); /* * If the data was read into a temporary buffer, * move it and free the buffer. */ if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { if (using_rdata) { abd_copy(hdr->b_crypt_hdr.b_rabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } else { abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, arc_hdr_size(hdr)); } } /* * The following must be done regardless of whether * there was an error: * - free the temporary buffer * - point zio to the real ARC buffer * - set zio size accordingly * These are required because zio is either re-used for * an I/O of the block in the case of the error * or the zio is passed to arc_read_done() and it * needs real data. */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); if (using_rdata) { ASSERT(HDR_HAS_RABD(hdr)); zio->io_abd = zio->io_orig_abd = hdr->b_crypt_hdr.b_rabd; } else { ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; } } ASSERT3P(zio->io_abd, !=, NULL); /* * Check this survived the L2ARC journey. */ ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ zio->io_prop.zp_complevel = hdr->b_complevel; valid_cksum = arc_cksum_is_equal(hdr, zio); /* * b_rabd will always match the data as it exists on disk if it is * being used. Therefore if we are reading into b_rabd we do not * attempt to untransform the data. */ if (valid_cksum && !using_rdata) tfm_error = l2arc_untransform(zio, cb); if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); } else { /* * Buffer didn't survive caching. Increment stats and * reissue to the original storage device. */ if (zio->io_error != 0) { ARCSTAT_BUMP(arcstat_l2_io_error); } else { zio->io_error = SET_ERROR(EIO); } if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* * If there's no waiter, issue an async i/o to the primary * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); void *abd = (using_rdata) ? hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio = zio_read(pio, zio->io_spa, zio->io_bp, abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); /* * Original ZIO will be freed, so we need to update * ARC header with the new ZIO pointer to be used * by zio_change_priority() in arc_read(). 
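 * Each waiter hanging off b_l1hdr.b_acb gets its acb_zio_head
 * re-pointed at the replacement zio below.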
*/ for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; acb != NULL; acb = acb->acb_next) acb->acb_zio_head = zio; mutex_exit(hash_lock); zio_nowait(zio); } else { mutex_exit(hash_lock); } } kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * This is the list priority from which the L2ARC will search for pages to * cache. This is used within loops (0..3) to cycle through lists in the * desired order. This order can have a significant effect on cache * performance. * * Currently the metadata lists are hit first, MFU then MRU, followed by * the data lists. This function returns a locked list, and also returns * the lock pointer. */ static multilist_sublist_t * l2arc_sublist_lock(int list_num) { multilist_t *ml = NULL; unsigned int idx; ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); switch (list_num) { case 0: ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; break; case 1: ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; break; case 2: ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; break; case 3: ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; break; default: return (NULL); } /* * Return a randomly-selected sublist. This is acceptable * because the caller feeds only a little bit of data for each * call (8MB). Subsequent calls will result in different * sublists being selected. */ idx = multilist_get_random_index(ml); return (multilist_sublist_lock_idx(ml, idx)); } /* * Calculates the maximum overhead of L2ARC metadata log blocks for a given * L2ARC write size. l2arc_evict and l2arc_write_size need to include this * overhead in processing to make sure there is enough headroom available * when writing buffers. */ static inline uint64_t l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev) { if (dev->l2ad_log_entries == 0) { return (0); } else { ASSERT(dev->l2ad_vdev != NULL); uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT; uint64_t log_blocks = (log_entries + dev->l2ad_log_entries - 1) / dev->l2ad_log_entries; return (vdev_psize_to_asize(dev->l2ad_vdev, sizeof (l2arc_log_blk_phys_t)) * log_blocks); } } /* * Evict buffers from the device write hand to the distance specified in * bytes. This distance may span populated buffers, it may span nothing. * This is clearing a region on the L2ARC device ready for writing. * If the 'all' boolean is set, every buffer is evicted. */ static void l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) { list_t *buflist; arc_buf_hdr_t *hdr, *hdr_prev; kmutex_t *hash_lock; uint64_t taddr; l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev; vdev_t *vd = dev->l2ad_vdev; boolean_t rerun; ASSERT(vd != NULL || all); ASSERT(dev->l2ad_spa != NULL || all); buflist = &dev->l2ad_buflist; top: rerun = B_FALSE; if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands * to the start and iterate. This iteration does not * happen indefinitely as we make sure in * l2arc_write_size() that when the write hand is reset, * the write size does not exceed the end of the device. */ rerun = B_TRUE; taddr = dev->l2ad_end; } else { taddr = dev->l2ad_hand + distance; } DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, uint64_t, taddr, boolean_t, all); if (!all) { /* * This check has to be placed after deciding whether to * iterate (rerun). */ if (dev->l2ad_first) { /* * This is the first sweep through the device. There is * nothing to evict. We have already trimmmed the * whole device. */ goto out; } else { /* * Trim the space to be evicted. 
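 * This is only done when the vdev supports TRIM and
 * l2arc_trim_ahead is set (see the check just below).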
*/ if (vd->vdev_has_trim && dev->l2ad_evict < taddr && l2arc_trim_ahead > 0) { /* * We have to drop the spa_config lock because * vdev_trim_range() will acquire it. * l2ad_evict already accounts for the label * size. To prevent vdev_trim_ranges() from * adding it again, we subtract it from * l2ad_evict. */ spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev); vdev_trim_simple(vd, dev->l2ad_evict - VDEV_LABEL_START_SIZE, taddr - dev->l2ad_evict); spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev, RW_READER); } /* * When rebuilding L2ARC we retrieve the evict hand * from the header of the device. Of note, l2arc_evict() * does not actually delete buffers from the cache * device, but trimming may do so depending on the * hardware implementation. Thus keeping track of the * evict hand is useful. */ dev->l2ad_evict = MAX(dev->l2ad_evict, taddr); } } retry: mutex_enter(&dev->l2ad_mtx); /* * We have to account for evicted log blocks. Run vdev_space_update() * on log blocks whose offset (in bytes) is before the evicted offset * (in bytes) by searching in the list of pointers to log blocks * present in the L2ARC device. */ for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf; lb_ptr_buf = lb_ptr_buf_prev) { lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf); /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE( (lb_ptr_buf->lb_ptr)->lbp_prop); /* * We don't worry about log blocks left behind (ie * lbp_payload_start < l2ad_hand) because l2arc_write_buffers() * will never write more than l2arc_evict() evicts. */ if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) { break; } else { if (vd != NULL) vdev_space_update(vd, -asize, 0, 0); ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize); ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count); zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); (void) zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf); list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf); kmem_free(lb_ptr_buf->lb_ptr, sizeof (l2arc_log_blkptr_t)); kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t)); } } for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { hdr_prev = list_prev(buflist, hdr); ASSERT(!HDR_EMPTY(hdr)); hash_lock = HDR_LOCK(hdr); /* * We cannot use mutex_enter or else we can deadlock * with l2arc_write_buffers (due to swapping the order * the hash lock and l2ad_mtx are taken). */ if (!mutex_tryenter(hash_lock)) { /* * Missed the hash lock. Retry. */ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); mutex_exit(&dev->l2ad_mtx); mutex_enter(hash_lock); mutex_exit(hash_lock); goto retry; } /* * A header can't be on this list if it doesn't have L2 header. */ ASSERT(HDR_HAS_L2HDR(hdr)); /* Ensure this header has finished being written. */ ASSERT(!HDR_L2_WRITING(hdr)); ASSERT(!HDR_L2_WRITE_HEAD(hdr)); if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict || hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { /* * We've evicted to the target address, * or the end of the device. */ mutex_exit(hash_lock); break; } if (!HDR_HAS_L1HDR(hdr)) { ASSERT(!HDR_L2_READING(hdr)); /* * This doesn't exist in the ARC. Destroy. * arc_hdr_destroy() will call list_remove() * and decrement arcstat_l2_lsize. */ arc_change_state(arc_anon, hdr); arc_hdr_destroy(hdr); } else { ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); ARCSTAT_BUMP(arcstat_l2_evict_l1cached); /* * Invalidate issued or about to be issued * reads, since we may be about to write * over this location. 
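 * Setting ARC_FLAG_L2_EVICTED below makes l2arc_read_done()
 * treat any in-flight read of this buffer as invalid and
 * reissue it to the main pool.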
*/ if (HDR_L2_READING(hdr)) { ARCSTAT_BUMP(arcstat_l2_evict_reading); arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); } arc_hdr_l2hdr_destroy(hdr); } mutex_exit(hash_lock); } mutex_exit(&dev->l2ad_mtx); out: /* * We need to check if we evict all buffers, otherwise we may iterate * unnecessarily. */ if (!all && rerun) { /* * Bump device hand to the device start if it is approaching the * end. l2arc_evict() has already evicted ahead for this case. */ dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; goto top; } if (!all) { /* * In case of cache device removal (all) the following * assertions may be violated without functional consequences * as the device is about to be removed. */ ASSERT3U(dev->l2ad_hand + distance, <=, dev->l2ad_end); if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } /* * Handle any abd transforms that might be required for writing to the L2ARC. * If successful, this function will always return an abd with the data * transformed as it is on disk in a new abd of asize bytes. */ static int l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, abd_t **abd_out) { int ret; abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; enum zio_compress compress = HDR_GET_COMPRESS(hdr); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t size = arc_hdr_size(hdr); boolean_t ismd = HDR_ISTYPE_METADATA(hdr); boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); dsl_crypto_key_t *dck = NULL; uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; boolean_t no_crypt = B_FALSE; ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) || HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); ASSERT3U(psize, <=, asize); /* * If this data simply needs its own buffer, we simply allocate it * and copy the data. This may be done to eliminate a dependency on a * shared buffer or to reallocate the buffer to match asize. */ if (HDR_HAS_RABD(hdr)) { ASSERT3U(asize, >, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); abd_zero_off(to_write, psize, asize - psize); goto out; } if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && !HDR_ENCRYPTED(hdr)) { ASSERT3U(size, ==, psize); to_write = abd_alloc_for_io(asize, ismd); abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); if (asize > size) abd_zero_off(to_write, size, asize - size); goto out; } if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { cabd = abd_alloc_for_io(MAX(size, asize), ismd); uint64_t csize = zio_compress_data(compress, to_write, &cabd, size, MIN(size, psize), hdr->b_complevel); if (csize >= size || csize > psize) { /* * We can't re-compress the block into the original * psize. Even if it fits into asize, it does not * matter, since checksum will never match on read. */ abd_free(cabd); return (SET_ERROR(EIO)); } if (asize > csize) abd_zero_off(cabd, csize, asize - csize); to_write = cabd; } if (HDR_ENCRYPTED(hdr)) { eabd = abd_alloc_for_io(asize, ismd); /* * If the dataset was disowned before the buffer * made it to this point, the key to re-encrypt * it won't be available. In this case we simply * won't write the buffer to the L2ARC. 
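 * The key lookup below fails in that case and the error is
 * returned to l2arc_write_buffers(), which then clears
 * ARC_FLAG_L2CACHE on the header and skips it.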
*/ ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, FTAG, &dck); if (ret != 0) goto error; ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, &no_crypt); if (ret != 0) goto error; if (no_crypt) abd_copy(eabd, to_write, psize); if (psize != asize) abd_zero_off(eabd, psize, asize - psize); /* assert that the MAC we got here matches the one we saved */ ASSERT0(memcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); spa_keystore_dsl_key_rele(spa, dck, FTAG); if (to_write == cabd) abd_free(cabd); to_write = eabd; } out: ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); *abd_out = to_write; return (0); error: if (dck != NULL) spa_keystore_dsl_key_rele(spa, dck, FTAG); if (cabd != NULL) abd_free(cabd); if (eabd != NULL) abd_free(eabd); *abd_out = NULL; return (ret); } static void l2arc_blk_fetch_done(zio_t *zio) { l2arc_read_callback_t *cb; cb = zio->io_private; if (cb->l2rcb_abd != NULL) abd_free(cb->l2rcb_abd); kmem_free(cb, sizeof (l2arc_read_callback_t)); } /* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. * The headroom_boost is an in-out parameter used to maintain headroom boost * state between calls to this function. * * Returns the number of bytes actually written (which may be smaller than * the delta by which the device hand has changed due to alignment and the * writing of log blocks). */ static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *hdr, *head, *marker; uint64_t write_asize, write_psize, headroom; boolean_t full, from_head = !arc_warm; l2arc_write_callback_t *cb = NULL; zio_t *pio, *wzio; uint64_t guid = spa_load_guid(spa); l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; ASSERT3P(dev->l2ad_vdev, !=, NULL); pio = NULL; write_asize = write_psize = 0; full = B_FALSE; head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); marker = arc_state_alloc_marker(); /* * Copy buffers for L2ARC writing. */ for (int pass = 0; pass < L2ARC_FEED_TYPES; pass++) { /* * pass == 0: MFU meta * pass == 1: MRU meta * pass == 2: MFU data * pass == 3: MRU data */ if (l2arc_mfuonly == 1) { if (pass == 1 || pass == 3) continue; } else if (l2arc_mfuonly > 1) { if (pass == 3) continue; } uint64_t passed_sz = 0; headroom = target_sz * l2arc_headroom; if (zfs_compressed_arc_enabled) headroom = (headroom * l2arc_headroom_boost) / 100; /* * Until the ARC is warm and starts to evict, read from the * head of the ARC lists rather than the tail. */ multilist_sublist_t *mls = l2arc_sublist_lock(pass); ASSERT3P(mls, !=, NULL); if (from_head) hdr = multilist_sublist_head(mls); else hdr = multilist_sublist_tail(mls); while (hdr != NULL) { kmutex_t *hash_lock; abd_t *to_write = NULL; hash_lock = HDR_LOCK(hdr); if (!mutex_tryenter(hash_lock)) { skip: /* Skip this buffer rather than waiting. */ if (from_head) hdr = multilist_sublist_next(mls, hdr); else hdr = multilist_sublist_prev(mls, hdr); continue; } passed_sz += HDR_GET_LSIZE(hdr); if (l2arc_headroom != 0 && passed_sz > headroom) { /* * Searched too far. 
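 * The scan of each sublist is bounded by 'headroom'
 * (l2arc_headroom times the target write size, further scaled
 * by l2arc_headroom_boost / 100 when compressed ARC is
 * enabled), so we stop here rather than walking the entire list.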
*/ mutex_exit(hash_lock); break; } if (!l2arc_write_eligible(guid, hdr)) { mutex_exit(hash_lock); goto skip; } ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); ASSERT3U(arc_hdr_size(hdr), >, 0); ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); /* * If the allocated size of this buffer plus the max * size for the pending log block exceeds the evicted * target size, terminate writing buffers for this run. */ if (write_asize + asize + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; } /* * We should not sleep with sublist lock held or it * may block ARC eviction. Insert a marker to save * the position and drop the lock. */ if (from_head) { multilist_sublist_insert_after(mls, hdr, marker); } else { multilist_sublist_insert_before(mls, hdr, marker); } multilist_sublist_unlock(mls); /* * If this header has b_rabd, we can use this since it * must always match the data exactly as it exists on * disk. Otherwise, the L2ARC can normally use the * hdr's data, but if we're sharing data between the * hdr and one of its bufs, L2ARC needs its own copy of * the data so that the ZIO below can't race with the * buf consumer. To ensure that this copy will be * available for the lifetime of the ZIO and be cleaned * up afterwards, we add it to the l2arc_free_on_write * queue. If we need to apply any transforms to the * data (compression, encryption) we will also need the * extra buffer. */ if (HDR_HAS_RABD(hdr) && psize == asize) { to_write = hdr->b_crypt_hdr.b_rabd; } else if ((HDR_COMPRESSION_ENABLED(hdr) || HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && psize == asize) { to_write = hdr->b_l1hdr.b_pabd; } else { int ret; arc_buf_contents_t type = arc_buf_type(hdr); ret = l2arc_apply_transforms(spa, hdr, asize, &to_write); if (ret != 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); goto next; } l2arc_free_abd_on_write(to_write, asize, type); } hdr->b_l2hdr.b_dev = dev; hdr->b_l2hdr.b_daddr = dev->l2ad_hand; hdr->b_l2hdr.b_hits = 0; hdr->b_l2hdr.b_arcs_state = hdr->b_l1hdr.b_state->arcs_state; /* l2arc_hdr_arcstats_update() expects a valid asize */ HDR_SET_L2SIZE(hdr, asize); arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR | ARC_FLAG_L2_WRITING); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); if (pio == NULL) { /* * Insert a dummy header on the buflist so * l2arc_write_done() can find where the * write buffers begin without searching. 
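 * l2arc_write_done() starts at this marker and walks toward the
 * list head, visiting exactly the headers queued by this write.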
*/ list_insert_head(&dev->l2ad_buflist, head); } list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); boolean_t commit = l2arc_log_blk_insert(dev, hdr); mutex_exit(hash_lock); if (pio == NULL) { cb = kmem_alloc( sizeof (l2arc_write_callback_t), KM_SLEEP); cb->l2wcb_dev = dev; cb->l2wcb_head = head; list_create(&cb->l2wcb_abd_list, sizeof (l2arc_lb_abd_buf_t), offsetof(l2arc_lb_abd_buf_t, node)); pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); } wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); zio_nowait(wzio); write_psize += psize; write_asize += asize; dev->l2ad_hand += asize; if (commit) { /* l2ad_hand will be adjusted inside. */ write_asize += l2arc_log_blk_commit(dev, pio, cb); } next: multilist_sublist_lock(mls); if (from_head) hdr = multilist_sublist_next(mls, marker); else hdr = multilist_sublist_prev(mls, marker); multilist_sublist_remove(mls, marker); } multilist_sublist_unlock(mls); if (full == B_TRUE) break; } arc_state_free_marker(marker); /* No buffers selected for writing? */ if (pio == NULL) { ASSERT0(write_psize); ASSERT(!HDR_HAS_L1HDR(head)); kmem_cache_free(hdr_l2only_cache, head); /* * Although we did not write any buffers l2ad_evict may * have advanced. */ if (dev->l2ad_evict != l2dhdr->dh_evict) l2arc_dev_hdr_update(dev); return (0); } if (!dev->l2ad_first) ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); ASSERT3U(write_asize, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); dev->l2ad_writing = B_FALSE; /* * Update the device header after the zio completes as * l2arc_write_done() may have updated the memory holding the log block * pointers in the device header. */ l2arc_dev_hdr_update(dev); return (write_asize); } static boolean_t l2arc_hdr_limit_reached(void) { int64_t s = aggsum_upper_bound(&arc_sums.arcstat_l2_hdr_size); return (arc_reclaim_needed() || (s > (arc_warm ? arc_c : arc_c_max) * l2arc_meta_percent / 100)); } /* * This thread feeds the L2ARC at regular intervals. This is the beating * heart of the L2ARC. */ static __attribute__((noreturn)) void l2arc_feed_thread(void *unused) { (void) unused; callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; uint64_t size, wrote; clock_t begin, next = ddi_get_lbolt(); fstrans_cookie_t cookie; CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); cookie = spl_fstrans_mark(); while (l2arc_thread_exit == 0) { CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait_idle(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. */ mutex_enter(&l2arc_dev_mtx); if (l2arc_ndev == 0) { mutex_exit(&l2arc_dev_mtx); continue; } mutex_exit(&l2arc_dev_mtx); begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in * doing so the next spa to feed from: dev->l2ad_spa. This * will return NULL if there are now no l2arc devices or if * they are all faulted. * * If a device is returned, its spa's config lock is also * held to prevent device removal. l2arc_dev_get_next() * will grab and release l2arc_dev_mtx. 
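 * The SCL_L2ARC config lock is dropped again with
 * spa_config_exit() once this feed cycle is done with the
 * device (or earlier if we bail out).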
*/ if ((dev = l2arc_dev_get_next()) == NULL) continue; spa = dev->l2ad_spa; ASSERT3P(spa, !=, NULL); /* * If the pool is read-only then force the feed thread to * sleep a little longer. */ if (!spa_writeable(spa)) { next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; spa_config_exit(spa, SCL_L2ARC, dev); continue; } /* * Avoid contributing to memory pressure. */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_abort_lowmem); spa_config_exit(spa, SCL_L2ARC, dev); continue; } ARCSTAT_BUMP(arcstat_l2_feeds); size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. */ l2arc_evict(dev, size, B_FALSE); /* * Write ARC buffers. */ wrote = l2arc_write_buffers(spa, dev, size); /* * Calculate interval between writes. */ next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } spl_fstrans_unmark(cookie); l2arc_thread_exit = 0; cv_broadcast(&l2arc_feed_thr_cv); CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ thread_exit(); } boolean_t l2arc_vdev_present(vdev_t *vd) { return (l2arc_vdev_get(vd) != NULL); } /* * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if * the vdev_t isn't an L2ARC device. */ l2arc_dev_t * l2arc_vdev_get(vdev_t *vd) { l2arc_dev_t *dev; mutex_enter(&l2arc_dev_mtx); for (dev = list_head(l2arc_dev_list); dev != NULL; dev = list_next(l2arc_dev_list, dev)) { if (dev->l2ad_vdev == vd) break; } mutex_exit(&l2arc_dev_mtx); return (dev); } static void l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; spa_t *spa = dev->l2ad_spa; /* * After a l2arc_remove_vdev(), the spa_t will no longer be valid */ if (spa == NULL) return; /* * The L2ARC has to hold at least the payload of one log block for * them to be restored (persistent L2ARC). The payload of a log block * depends on the amount of its log entries. We always write log blocks * with 1022 entries. How many of them are committed or restored depends * on the size of the L2ARC device. Thus the maximum payload of * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device * is less than that, we reduce the amount of committed and restored * log entries per block so as to enable persistence. */ if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { dev->l2ad_log_entries = 0; } else { dev->l2ad_log_entries = MIN((dev->l2ad_end - dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, L2ARC_LOG_BLK_MAX_ENTRIES); } /* * Read the device header, if an error is returned do not rebuild L2ARC. */ if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) { /* * If we are onlining a cache device (vdev_reopen) that was * still present (l2arc_vdev_present()) and rebuild is enabled, * we should evict all ARC buffers and pointers to log blocks * and reclaim their space before restoring its contents to * L2ARC. */ if (reopen) { if (!l2arc_rebuild_enabled) { return; } else { l2arc_evict(dev, 0, B_TRUE); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; } } /* * Just mark the device as pending for a rebuild. We won't * be starting a rebuild in line here as it would block pool * import. Instead spa_load_impl will hand that off to an * async task which will call l2arc_spa_rebuild_start. */ dev->l2ad_rebuild = B_TRUE; } else if (spa_writeable(spa)) { /* * In this case TRIM the whole device if l2arc_trim_ahead > 0, * otherwise create a new header. 
We zero out the memory holding * the header to reset dh_start_lbps. If we TRIM the whole * device the new header will be written by * vdev_trim_l2arc_thread() at the end of the TRIM to update the * trim_state in the header too. When reading the header, if * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 * we opt to TRIM the whole device again. */ if (l2arc_trim_ahead > 0) { dev->l2ad_trim_all = B_TRUE; } else { memset(l2dhdr, 0, l2dhdr_asize); l2arc_dev_hdr_update(dev); } } } /* * Add a vdev for use by the L2ARC. By this point the spa has already * validated the vdev and opened it. */ void l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; uint64_t l2dhdr_asize; ASSERT(!l2arc_vdev_present(vd)); /* * Create a new l2arc device entry. */ adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); adddev->l2ad_spa = spa; adddev->l2ad_vdev = vd; /* leave extra size for an l2arc device header */ l2dhdr_asize = adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; adddev->l2ad_writing = B_FALSE; adddev->l2ad_trim_all = B_FALSE; list_link_init(&adddev->l2ad_node); adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); /* * This is a list of all ARC buffers that are still valid on the * device. */ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); /* * This is a list of pointers to log blocks that are still present * on the device. */ list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), offsetof(l2arc_lb_ptr_buf_t, node)); vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); zfs_refcount_create(&adddev->l2ad_alloc); zfs_refcount_create(&adddev->l2ad_lb_asize); zfs_refcount_create(&adddev->l2ad_lb_count); /* * Decide if dev is eligible for L2ARC rebuild or whole device * trimming. This has to happen before the device is added in the * cache device list and l2arc_dev_mtx is released. Otherwise * l2arc_feed_thread() might already start writing on the * device. */ l2arc_rebuild_dev(adddev, B_FALSE); /* * Add device to global list */ mutex_enter(&l2arc_dev_mtx); list_insert_head(l2arc_dev_list, adddev); atomic_inc_64(&l2arc_ndev); mutex_exit(&l2arc_dev_mtx); } /* * Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen() * in case of onlining a cache device. */ void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) { l2arc_dev_t *dev = NULL; dev = l2arc_vdev_get(vd); ASSERT3P(dev, !=, NULL); /* * In contrast to l2arc_add_vdev() we do not have to worry about * l2arc_feed_thread() invalidating previous content when onlining a * cache device. The device parameters (l2ad*) are not cleared when * offlining the device and writing new buffers will not invalidate * all previous content. In worst case only buffers that have not had * their log block written to the device will be lost. 
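 * (typically just the entries of the log block that has not yet
 * been committed).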
* When onlining the cache device (ie offline->online without exporting * the pool in between) this happens: * vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev() * | | * vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE * During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild * is set to B_TRUE we might write additional buffers to the device. */ l2arc_rebuild_dev(dev, reopen); } typedef struct { l2arc_dev_t *rva_l2arc_dev; uint64_t rva_spa_gid; uint64_t rva_vdev_gid; boolean_t rva_async; } remove_vdev_args_t; static void l2arc_device_teardown(void *arg) { remove_vdev_args_t *rva = arg; l2arc_dev_t *remdev = rva->rva_l2arc_dev; hrtime_t start_time = gethrtime(); /* * Clear all buflists and ARC references. L2ARC device flush. */ l2arc_evict(remdev, 0, B_TRUE); list_destroy(&remdev->l2ad_buflist); ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); list_destroy(&remdev->l2ad_lbptr_list); mutex_destroy(&remdev->l2ad_mtx); zfs_refcount_destroy(&remdev->l2ad_alloc); zfs_refcount_destroy(&remdev->l2ad_lb_asize); zfs_refcount_destroy(&remdev->l2ad_lb_count); kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); vmem_free(remdev, sizeof (l2arc_dev_t)); uint64_t elaspsed = NSEC2MSEC(gethrtime() - start_time); if (elaspsed > 0) { zfs_dbgmsg("spa %llu, vdev %llu removed in %llu ms", (u_longlong_t)rva->rva_spa_gid, (u_longlong_t)rva->rva_vdev_gid, (u_longlong_t)elaspsed); } if (rva->rva_async) arc_async_flush_remove(rva->rva_spa_gid, 2); kmem_free(rva, sizeof (remove_vdev_args_t)); } /* * Remove a vdev from the L2ARC. */ void l2arc_remove_vdev(vdev_t *vd) { spa_t *spa = vd->vdev_spa; boolean_t asynchronous = spa->spa_state == POOL_STATE_EXPORTED || spa->spa_state == POOL_STATE_DESTROYED; /* * Find the device by vdev */ l2arc_dev_t *remdev = l2arc_vdev_get(vd); ASSERT3P(remdev, !=, NULL); /* * Save info for final teardown */ remove_vdev_args_t *rva = kmem_alloc(sizeof (remove_vdev_args_t), KM_SLEEP); rva->rva_l2arc_dev = remdev; rva->rva_spa_gid = spa_load_guid(spa); rva->rva_vdev_gid = remdev->l2ad_vdev->vdev_guid; /* * Cancel any ongoing or scheduled rebuild. 
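 * If a rebuild thread is already running, wait on
 * l2arc_rebuild_thr_cv for it to notice l2ad_rebuild_cancel and
 * exit before the device is torn down.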
*/ mutex_enter(&l2arc_rebuild_thr_lock); remdev->l2ad_rebuild_cancel = B_TRUE; if (remdev->l2ad_rebuild_began == B_TRUE) { while (remdev->l2ad_rebuild == B_TRUE) cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } mutex_exit(&l2arc_rebuild_thr_lock); rva->rva_async = asynchronous; /* * Remove device from global list */ ASSERT(spa_config_held(spa, SCL_L2ARC, RW_WRITER) & SCL_L2ARC); mutex_enter(&l2arc_dev_mtx); list_remove(l2arc_dev_list, remdev); l2arc_dev_last = NULL; /* may have been invalidated */ atomic_dec_64(&l2arc_ndev); /* During a pool export spa & vdev will no longer be valid */ if (asynchronous) { remdev->l2ad_spa = NULL; remdev->l2ad_vdev = NULL; } mutex_exit(&l2arc_dev_mtx); if (!asynchronous) { l2arc_device_teardown(rva); return; } arc_async_flush_t *af = arc_async_flush_add(rva->rva_spa_gid, 2); taskq_dispatch_ent(arc_flush_taskq, l2arc_device_teardown, rva, TQ_SLEEP, &af->af_tqent); } void l2arc_init(void) { l2arc_thread_exit = 0; l2arc_ndev = 0; mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); l2arc_dev_list = &L2ARC_dev_list; l2arc_free_on_write = &L2ARC_free_on_write; list_create(l2arc_dev_list, sizeof (l2arc_dev_t), offsetof(l2arc_dev_t, l2ad_node)); list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), offsetof(l2arc_data_free_t, l2df_list_node)); } void l2arc_fini(void) { mutex_destroy(&l2arc_feed_thr_lock); cv_destroy(&l2arc_feed_thr_cv); mutex_destroy(&l2arc_rebuild_thr_lock); cv_destroy(&l2arc_rebuild_thr_cv); mutex_destroy(&l2arc_dev_mtx); mutex_destroy(&l2arc_free_on_write_mtx); list_destroy(l2arc_dev_list); list_destroy(l2arc_free_on_write); } void l2arc_start(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, TS_RUN, defclsyspri); } void l2arc_stop(void) { if (!(spa_mode_global & SPA_MODE_WRITE)) return; mutex_enter(&l2arc_feed_thr_lock); cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ l2arc_thread_exit = 1; while (l2arc_thread_exit != 0) cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); mutex_exit(&l2arc_feed_thr_lock); } /* * Punches out rebuild threads for the L2ARC devices in a spa. This should * be called after pool import from the spa async thread, since starting * these threads directly from spa_import() will make them part of the * "zpool import" context and delay process exit (and thus pool import). */ void l2arc_spa_rebuild_start(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Locate the spa's l2arc devices and kick off rebuild threads. 
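 * Only devices marked l2ad_rebuild (and not already cancelled)
 * get a thread; UNAVAIL cache vdevs are skipped.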
*/ for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) { /* Don't attempt a rebuild if the vdev is UNAVAIL */ continue; } mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { dev->l2ad_rebuild_began = B_TRUE; (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, dev, 0, &p0, TS_RUN, minclsyspri); } mutex_exit(&l2arc_rebuild_thr_lock); } } void l2arc_spa_rebuild_stop(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock) || spa->spa_export_thread == curthread); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) continue; mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_cancel = B_TRUE; mutex_exit(&l2arc_rebuild_thr_lock); } for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { l2arc_dev_t *dev = l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); if (dev == NULL) continue; mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_began == B_TRUE) { while (dev->l2ad_rebuild == B_TRUE) { cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); } } mutex_exit(&l2arc_rebuild_thr_lock); } } /* * Main entry point for L2ARC rebuilding. */ static __attribute__((noreturn)) void l2arc_dev_rebuild_thread(void *arg) { l2arc_dev_t *dev = arg; VERIFY(dev->l2ad_rebuild); (void) l2arc_rebuild(dev); mutex_enter(&l2arc_rebuild_thr_lock); dev->l2ad_rebuild_began = B_FALSE; dev->l2ad_rebuild = B_FALSE; cv_signal(&l2arc_rebuild_thr_cv); mutex_exit(&l2arc_rebuild_thr_lock); thread_exit(); } /* * This function implements the actual L2ARC metadata rebuild. It: * starts reading the log block chain and restores each block's contents * to memory (reconstructing arc_buf_hdr_t's). * * Operation stops under any of the following conditions: * * 1) We reach the end of the log block chain. * 2) We encounter *any* error condition (cksum errors, io errors) */ static int l2arc_rebuild(l2arc_dev_t *dev) { vdev_t *vd = dev->l2ad_vdev; spa_t *spa = vd->vdev_spa; int err = 0; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; l2arc_log_blk_phys_t *this_lb, *next_lb; zio_t *this_io = NULL, *next_io = NULL; l2arc_log_blkptr_t lbps[2]; l2arc_lb_ptr_buf_t *lb_ptr_buf; boolean_t lock_held; this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); /* * We prevent device removal while issuing reads to the device, * then during the rebuilding phases we drop this lock again so * that a spa_unload or device remove can be initiated - this is * safe, because the spa will signal us to stop before removing * our device and wait for us to stop. */ spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); lock_held = B_TRUE; /* * Retrieve the persistent L2ARC device state. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), dev->l2ad_start); dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; vd->vdev_trim_state = l2dhdr->dh_trim_state; /* * In case the zfs module parameter l2arc_rebuild_enabled is false * we do not start the rebuild process. 
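 * (The skip is still recorded in the pool history via
 * spa_history_log_internal() at the 'out' label.)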
*/ if (!l2arc_rebuild_enabled) goto out; /* Prepare the rebuild process */ memcpy(lbps, l2dhdr->dh_start_lbps, sizeof (lbps)); /* Start the rebuild process */ for (;;) { if (!l2arc_log_blkptr_valid(dev, &lbps[0])) break; if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], this_lb, next_lb, this_io, &next_io)) != 0) goto out; /* * Our memory pressure valve. If the system is running low * on memory, rather than swamping memory with new ARC buf * hdrs, we opt not to rebuild the L2ARC. At this point, * however, we have already set up our L2ARC dev to chain in * new metadata log blocks, so the user may choose to offline/ * online the L2ARC dev at a later time (or re-import the pool) * to reconstruct it (when there's less memory pressure). */ if (l2arc_hdr_limit_reached()) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); cmn_err(CE_NOTE, "System running low on memory, " "aborting L2ARC rebuild."); err = SET_ERROR(ENOMEM); goto out; } spa_config_exit(spa, SCL_L2ARC, vd); lock_held = B_FALSE; /* * Now that we know that the next_lb checks out alright, we * can start reconstruction from this log block. * L2BLK_GET_PSIZE returns aligned size for log blocks. */ uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); l2arc_log_blk_restore(dev, this_lb, asize); /* * log block restored, include its pointer in the list of * pointers to log blocks present in the L2ARC device. */ lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); memcpy(lb_ptr_buf->lb_ptr, &lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); vdev_space_update(vd, asize, 0, 0); /* * Protection against loops of log blocks: * * l2ad_hand l2ad_evict * V V * l2ad_start |=======================================| l2ad_end * -----|||----|||---|||----||| * (3) (2) (1) (0) * ---|||---|||----|||---||| * (7) (6) (5) (4) * * In this situation the pointer of log block (4) passes * l2arc_log_blkptr_valid() but the log block should not be * restored as it is overwritten by the payload of log block * (0). Only log blocks (0)-(3) should be restored. We check * whether l2ad_evict lies in between the payload starting * offset of the next log block (lbps[1].lbp_payload_start) * and the payload starting offset of the present log block * (lbps[0].lbp_payload_start). If true and this isn't the * first pass, we are looping from the beginning and we should * stop. */ if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, lbps[0].lbp_payload_start, dev->l2ad_evict) && !dev->l2ad_first) goto out; kpreempt(KPREEMPT_SYNC); for (;;) { mutex_enter(&l2arc_rebuild_thr_lock); if (dev->l2ad_rebuild_cancel) { mutex_exit(&l2arc_rebuild_thr_lock); err = SET_ERROR(ECANCELED); goto out; } mutex_exit(&l2arc_rebuild_thr_lock); if (spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) { lock_held = B_TRUE; break; } /* * L2ARC config lock held by somebody in writer, * possibly due to them trying to remove us. They'll * likely to want us to shut down, so after a little * delay, we check l2ad_rebuild_cancel and retry * the lock again. */ delay(1); } /* * Continue with the next log block. 
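 * lbps[1] is refilled from lb_prev_lbp, so the walk proceeds
 * from the most recent log block toward progressively older
 * ones until the chain ends or a pointer is found invalid.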
*/ lbps[0] = lbps[1]; lbps[1] = this_lb->lb_prev_lbp; PTR_SWAP(this_lb, next_lb); this_io = next_io; next_io = NULL; } if (this_io != NULL) l2arc_log_blk_fetch_abort(this_io); out: if (next_io != NULL) l2arc_log_blk_fetch_abort(next_io); vmem_free(this_lb, sizeof (*this_lb)); vmem_free(next_lb, sizeof (*next_lb)); if (err == ECANCELED) { /* * In case the rebuild was canceled do not log to spa history * log as the pool may be in the process of being removed. */ zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); return (err); } else if (!l2arc_rebuild_enabled) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "disabled"); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_success); spa_history_log_internal(spa, "L2ARC rebuild", NULL, "successful, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { /* * No error but also nothing restored, meaning the lbps array * in the device header points to invalid/non-present log * blocks. Reset the header. */ spa_history_log_internal(spa, "L2ARC rebuild", NULL, "no valid log blocks"); memset(l2dhdr, 0, dev->l2ad_dev_hdr_asize); l2arc_dev_hdr_update(dev); } else if (err != 0) { spa_history_log_internal(spa, "L2ARC rebuild", NULL, "aborted, restored %llu blocks", (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); } if (lock_held) spa_config_exit(spa, SCL_L2ARC, vd); return (err); } /* * Attempts to read the device header on the provided L2ARC device and writes * it to `hdr'. On success, this function returns 0, otherwise the appropriate * error code is returned. */ static int l2arc_dev_hdr_read(l2arc_dev_t *dev) { int err; uint64_t guid; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; guid = spa_guid(dev->l2ad_vdev->vdev_spa); abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); if (err != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); return (err); } if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || l2dhdr->dh_spa_guid != guid || l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || l2dhdr->dh_log_entries != dev->l2ad_log_entries || l2dhdr->dh_end != dev->l2ad_end || !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, l2dhdr->dh_evict) || (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && l2arc_trim_ahead > 0)) { /* * Attempt to rebuild a device containing no actual dev hdr * or containing a header from some other pool or from another * version of persistent L2ARC. */ ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); return (SET_ERROR(ENOTSUP)); } return (0); } /* * Reads L2ARC log blocks from storage and validates their contents. * * This function implements a simple fetcher to make sure that while * we're processing one buffer the L2ARC is already fetching the next * one in the chain. 
* * The arguments this_lp and next_lp point to the current and next log block * address in the block chain. Similarly, this_lb and next_lb hold the * l2arc_log_blk_phys_t's of the current and next L2ARC blk. * * The `this_io' and `next_io' arguments are used for block fetching. * When issuing the first blk IO during rebuild, you should pass NULL for * `this_io'. This function will then issue a sync IO to read the block and * also issue an async IO to fetch the next block in the block chain. The * fetched IO is returned in `next_io'. On subsequent calls to this * function, pass the value returned in `next_io' from the previous call * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO. * Prior to the call, you should initialize your `next_io' pointer to be * NULL. If no fetch IO was issued, the pointer is left set at NULL. * * On success, this function returns 0, otherwise it returns an appropriate * error code. On error the fetching IO is aborted and cleared before * returning from this function. Therefore, if we return `success', the * caller can assume that we have taken care of cleanup of fetch IOs. */ static int l2arc_log_blk_read(l2arc_dev_t *dev, const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, zio_t *this_io, zio_t **next_io) { int err = 0; zio_cksum_t cksum; uint64_t asize; ASSERT(this_lbp != NULL && next_lbp != NULL); ASSERT(this_lb != NULL && next_lb != NULL); ASSERT(next_io != NULL && *next_io == NULL); ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); /* * Check to see if we have issued the IO for this log block in a * previous run. If not, this is the first call, so issue it now. */ if (this_io == NULL) { this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp, this_lb); } /* * Peek to see if we can start issuing the next IO immediately. */ if (l2arc_log_blkptr_valid(dev, next_lbp)) { /* * Start issuing IO for the next log block early - this * should help keep the L2ARC device busy while we * decompress and restore this log block. */ *next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp, next_lb); } /* Wait for the IO to read this log block to complete */ if ((err = zio_wait(this_io)) != 0) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); zfs_dbgmsg("L2ARC IO error (%d) while reading log block, " "offset: %llu, vdev guid: %llu", err, (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid); goto cleanup; } /* * Make sure the buffer checks out. * L2BLK_GET_PSIZE returns aligned size for log blocks. 
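 * The fletcher-4 checksum stored in the log block pointer is
 * verified over the aligned on-disk size before the block is
 * decompressed and byteswapped.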
*/ asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); fletcher_4_native(this_lb, asize, NULL, &cksum); if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", (u_longlong_t)this_lbp->lbp_daddr, (u_longlong_t)dev->l2ad_vdev->vdev_guid, (u_longlong_t)dev->l2ad_hand, (u_longlong_t)dev->l2ad_evict); err = SET_ERROR(ECKSUM); goto cleanup; } /* Now we can take our time decoding this buffer */ switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { case ZIO_COMPRESS_OFF: break; case ZIO_COMPRESS_LZ4: { abd_t *abd = abd_alloc_linear(asize, B_TRUE); abd_copy_from_buf_off(abd, this_lb, 0, asize); abd_t dabd; abd_get_from_buf_struct(&dabd, this_lb, sizeof (*this_lb)); err = zio_decompress_data( L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), abd, &dabd, asize, sizeof (*this_lb), NULL); abd_free(&dabd); abd_free(abd); if (err != 0) { err = SET_ERROR(EINVAL); goto cleanup; } break; } default: err = SET_ERROR(EINVAL); goto cleanup; } if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) byteswap_uint64_array(this_lb, sizeof (*this_lb)); if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { err = SET_ERROR(EINVAL); goto cleanup; } cleanup: /* Abort an in-flight fetch I/O in case of error */ if (err != 0 && *next_io != NULL) { l2arc_log_blk_fetch_abort(*next_io); *next_io = NULL; } return (err); } /* * Restores the payload of a log block to ARC. This creates empty ARC hdr * entries which only contain an l2arc hdr, essentially restoring the * buffers to their L2ARC evicted state. This function also updates space * usage on the L2ARC vdev to make sure it tracks restored buffers. */ static void l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, uint64_t lb_asize) { uint64_t size = 0, asize = 0; uint64_t log_entries = dev->l2ad_log_entries; /* * Usually arc_adapt() is called only for data, not headers, but * since we may allocate significant amount of memory here, let ARC * grow its arc_c. */ arc_adapt(log_entries * HDR_L2ONLY_SIZE); for (int i = log_entries - 1; i >= 0; i--) { /* * Restore goes in the reverse temporal direction to preserve * correct temporal ordering of buffers in the l2ad_buflist. * l2arc_hdr_restore also does a list_insert_tail instead of * list_insert_head on the l2ad_buflist: * * LIST l2ad_buflist LIST * HEAD <------ (time) ------ TAIL * direction +-----+-----+-----+-----+-----+ direction * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild * fill +-----+-----+-----+-----+-----+ * ^ ^ * | | * | | * l2arc_feed_thread l2arc_rebuild * will place new bufs here restores bufs here * * During l2arc_rebuild() the device is not used by * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. */ size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); asize += vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); l2arc_hdr_restore(&lb->lb_entries[i], dev); } /* * Record rebuild stats: * size Logical size of restored buffers in the L2ARC * asize Aligned size of restored buffers in the L2ARC */ ARCSTAT_INCR(arcstat_l2_rebuild_size, size); ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); } /* * Restores a single ARC buf hdr from a log entry. 
The ARC buffer is put * into a state indicating that it has been evicted to L2ARC. */ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev) { arc_buf_hdr_t *hdr, *exists; kmutex_t *hash_lock; arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, L2BLK_GET_PSIZE((le)->le_prop)); /* * Do all the allocation before grabbing any locks, this lets us * sleep if memory is full and we don't have to deal with failed * allocations. */ hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type, dev, le->le_dva, le->le_daddr, L2BLK_GET_PSIZE((le)->le_prop), asize, le->le_birth, L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel, L2BLK_GET_PROTECTED((le)->le_prop), L2BLK_GET_PREFETCH((le)->le_prop), L2BLK_GET_STATE((le)->le_prop)); /* * vdev_space_update() has to be called before arc_hdr_destroy() to * avoid underflow since the latter also calls vdev_space_update(). */ l2arc_hdr_arcstats_increment(hdr); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, hdr); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr); mutex_exit(&dev->l2ad_mtx); exists = buf_hash_insert(hdr, &hash_lock); if (exists) { /* Buffer was already cached, no need to restore it. */ arc_hdr_destroy(hdr); /* * If the buffer is already cached, check whether it has * L2ARC metadata. If not, enter them and update the flag. * This is important is case of onlining a cache device, since * we previously evicted all L2ARC metadata from ARC. */ if (!HDR_HAS_L2HDR(exists)) { arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR); exists->b_l2hdr.b_dev = dev; exists->b_l2hdr.b_daddr = le->le_daddr; exists->b_l2hdr.b_arcs_state = L2BLK_GET_STATE((le)->le_prop); /* l2arc_hdr_arcstats_update() expects a valid asize */ HDR_SET_L2SIZE(exists, asize); mutex_enter(&dev->l2ad_mtx); list_insert_tail(&dev->l2ad_buflist, exists); (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(exists), exists); mutex_exit(&dev->l2ad_mtx); l2arc_hdr_arcstats_increment(exists); vdev_space_update(dev->l2ad_vdev, asize, 0, 0); } ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); } mutex_exit(hash_lock); } /* * Starts an asynchronous read IO to read a log block. This is used in log * block reconstruction to start reading the next block before we are done * decoding and reconstructing the current block, to keep the l2arc device * nice and hot with read IO to process. * The returned zio will contain a newly allocated memory buffers for the IO * data which should then be freed by the caller once the zio is no longer * needed (i.e. due to it having completed). If you wish to abort this * zio, you should do so using l2arc_log_blk_fetch_abort, which takes * care of disposing of the allocated buffers correctly. 
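 * (The abort simply waits for the zio; the temporary abd and
 * the callback are then freed by l2arc_blk_fetch_done.)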
*/ static zio_t * l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, l2arc_log_blk_phys_t *lb) { uint32_t asize; zio_t *pio; l2arc_read_callback_t *cb; /* L2BLK_GET_PSIZE returns aligned size for log blocks */ asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); } /* * Aborts a zio returned from l2arc_log_blk_fetch and frees the data * buffers allocated for it. */ static void l2arc_log_blk_fetch_abort(zio_t *zio) { (void) zio_wait(zio); } /* * Creates a zio to update the device header on an l2arc device. */ void l2arc_dev_hdr_update(l2arc_dev_t *dev) { l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; abd_t *abd; int err; VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; l2dhdr->dh_log_entries = dev->l2ad_log_entries; l2dhdr->dh_evict = dev->l2ad_evict; l2dhdr->dh_start = dev->l2ad_start; l2dhdr->dh_end = dev->l2ad_end; l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); l2dhdr->dh_flags = 0; l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; if (dev->l2ad_first) l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); abd_free(abd); if (err != 0) { zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " "vdev guid: %llu", err, (u_longlong_t)dev->l2ad_vdev->vdev_guid); } } /* * Commits a log block to the L2ARC device. This routine is invoked from * l2arc_write_buffers when the log block fills up. * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; uint64_t psize, asize; zio_t *wzio; l2arc_lb_abd_buf_t *abd_buf; abd_t *abd = NULL; l2arc_lb_ptr_buf_t *lb_ptr_buf; VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); abd_buf = zio_buf_alloc(sizeof (*abd_buf)); abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); /* link the buffer into the block chain */ lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; lb->lb_magic = L2ARC_LOG_BLK_MAGIC; /* * l2arc_log_blk_commit() may be called multiple times during a single * l2arc_write_buffers() call. Save the allocated abd buffers in a list * so we can free them in l2arc_write_done() later on. 
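 * l2arc_write_done() drains l2wcb_abd_list and, if the write
 * failed, also drops the corresponding entries from
 * l2ad_lbptr_list and reclaims their space.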
*/ list_insert_tail(&cb->l2wcb_abd_list, abd_buf); /* try to compress the buffer, at least one sector to save */ psize = zio_compress_data(ZIO_COMPRESS_LZ4, abd_buf->abd, &abd, sizeof (*lb), zio_get_compression_max_size(ZIO_COMPRESS_LZ4, dev->l2ad_vdev->vdev_ashift, dev->l2ad_vdev->vdev_ashift, sizeof (*lb)), 0); /* a log block is never entirely zero */ ASSERT(psize != 0); asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); ASSERT(asize <= sizeof (*lb)); /* * Update the start log block pointer in the device header to point * to the log block we're about to write. */ l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; l2dhdr->dh_start_lbps[0].lbp_payload_asize = dev->l2ad_log_blk_payload_asize; l2dhdr->dh_start_lbps[0].lbp_payload_start = dev->l2ad_log_blk_payload_start; L2BLK_SET_LSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); L2BLK_SET_PSIZE( (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); L2BLK_SET_CHECKSUM( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_CHECKSUM_FLETCHER_4); if (asize < sizeof (*lb)) { /* compression succeeded */ abd_zero_off(abd, psize, asize - psize); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_LZ4); } else { /* compression failed */ abd_copy_from_buf_off(abd, lb, 0, sizeof (*lb)); L2BLK_SET_COMPRESS( (&l2dhdr->dh_start_lbps[0])->lbp_prop, ZIO_COMPRESS_OFF); } /* checksum what we're about to write */ abd_fletcher_4_native(abd, asize, NULL, &l2dhdr->dh_start_lbps[0].lbp_cksum); abd_free(abd_buf->abd); /* perform the write itself */ abd_buf->abd = abd; wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); (void) zio_nowait(wzio); dev->l2ad_hand += asize; vdev_space_update(dev->l2ad_vdev, asize, 0, 0); /* * Include the committed log block's pointer in the list of pointers * to log blocks present in the L2ARC device. */ memcpy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[0], sizeof (l2arc_log_blkptr_t)); mutex_enter(&dev->l2ad_mtx); list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_count); zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); mutex_exit(&dev->l2ad_mtx); /* bump the kstats */ ARCSTAT_INCR(arcstat_l2_write_bytes, asize); ARCSTAT_BUMP(arcstat_l2_log_blk_writes); ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, dev->l2ad_log_blk_payload_asize / asize); /* start a new log block */ dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; return (asize); } /* * Validates an L2ARC log block address to make sure that it can be read * from the provided L2ARC device. 
*/ boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) { /* L2BLK_GET_PSIZE returns aligned size for log blocks */ uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); uint64_t end = lbp->lbp_daddr + asize - 1; uint64_t start = lbp->lbp_payload_start; boolean_t evicted = B_FALSE; /* * A log block is valid if all of the following conditions are true: * - it fits entirely (including its payload) between l2ad_start and * l2ad_end * - it has a valid size * - neither the log block itself nor part of its payload was evicted * by l2arc_evict(): * * l2ad_hand l2ad_evict * | | lbp_daddr * | start | | end * | | | | | * V V V V V * l2ad_start ============================================ l2ad_end * --------------------------|||| * ^ ^ * | log block * payload */ evicted = l2arc_range_check_overlap(start, end, dev->l2ad_hand) || l2arc_range_check_overlap(start, end, dev->l2ad_evict) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); return (start >= dev->l2ad_start && end <= dev->l2ad_end && asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && (!evicted || dev->l2ad_first)); } /* * Inserts ARC buffer header `hdr' into the current L2ARC log block on * the device. The buffer being inserted must be present in L2ARC. * Returns B_TRUE if the L2ARC log block is full and needs to be committed * to L2ARC, or B_FALSE if it still has room for more ARC buffers. */ static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; l2arc_log_ent_phys_t *le; if (dev->l2ad_log_entries == 0) return (B_FALSE); int index = dev->l2ad_log_ent_idx++; ASSERT3S(index, <, dev->l2ad_log_entries); ASSERT(HDR_HAS_L2HDR(hdr)); le = &lb->lb_entries[index]; memset(le, 0, sizeof (*le)); le->le_dva = hdr->b_dva; le->le_birth = hdr->b_birth; le->le_daddr = hdr->b_l2hdr.b_daddr; if (index == 0) dev->l2ad_log_blk_payload_start = le->le_daddr; L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); le->le_complevel = hdr->b_complevel; L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state); dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, HDR_GET_PSIZE(hdr)); return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); } /* * Checks whether a given L2ARC device address sits in a time-sequential * range. The trick here is that the L2ARC is a rotary buffer, so we can't * just do a range comparison, we need to handle the situation in which the * range wraps around the end of the L2ARC device. Arguments: * bottom -- Lower end of the range to check (written to earlier). * top -- Upper end of the range to check (written to later). * check -- The address for which we want to determine if it sits in * between the top and bottom. * * The 3-way conditional below represents the following cases: * * bottom < top : Sequentially ordered case: * --------+-------------------+ * | (overlap here?) | * L2ARC dev V V * |---------------============--------------| * * bottom > top: Looped-around case: * --------+------------------+ * | (overlap here?) | * L2ARC dev V V * |===============---------------===========| * ^ ^ * | (or here?) 
| * +---------------+--------- * * top == bottom : Just a single address comparison. */ boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) { if (bottom < top) return (bottom <= check && check <= top); else if (bottom > top) return (check <= top || bottom <= check); else return (check == top); } EXPORT_SYMBOL(arc_buf_size); EXPORT_SYMBOL(arc_write); EXPORT_SYMBOL(arc_read); EXPORT_SYMBOL(arc_buf_info); EXPORT_SYMBOL(arc_getbuf_func); EXPORT_SYMBOL(arc_add_prune_callback); EXPORT_SYMBOL(arc_remove_prune_callback); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_min, spl_param_get_u64, ZMOD_RW, "Minimum ARC size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_max, spl_param_get_u64, ZMOD_RW, "Maximum ARC size in bytes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_balance, UINT, ZMOD_RW, "Balance between metadata and data on ghost hits."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, param_get_uint, ZMOD_RW, "Seconds before growing ARC size"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, param_get_uint, ZMOD_RW, "log2(fraction of ARC to reclaim)"); #ifdef _KERNEL ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, "Percent of pagecache to reclaim ARC to"); #endif ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, UINT, ZMOD_RD, "Target average block size"); ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, "Disable compressed ARC buffers"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prefetch block in ms"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, param_set_arc_int, param_get_uint, ZMOD_RW, "Min life of prescient prefetched block in ms"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, U64, ZMOD_RW, "Max write bytes per interval"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, U64, ZMOD_RW, "Extra write bytes during device warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, U64, ZMOD_RW, "Number of max device writes to precache"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, U64, ZMOD_RW, "Compressed l2arc_headroom multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, U64, ZMOD_RW, "TRIM ahead L2ARC write size multiplier"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, U64, ZMOD_RW, "Seconds between L2ARC writing"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, U64, ZMOD_RW, "Min feed interval in milliseconds"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, "Skip caching prefetched buffers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, ZMOD_RW, "Turbo L2ARC warmup"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, "No reads during writes"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, meta_percent, UINT, ZMOD_RW, "Percent of ARC size allowed for L2ARC-only headers"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, "Rebuild the L2ARC when importing a pool"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, U64, ZMOD_RW, "Min size in bytes to write rebuild log blocks in L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, mfuonly, INT, ZMOD_RW, "Cache only MFU data from ARC into L2ARC"); ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, exclude_special, INT, ZMOD_RW, "Exclude dbufs on special vdevs from being cached to L2ARC if set."); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "System free memory I/O throttle in bytes"); 
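/*
 * Worked example of the wrap-around logic in l2arc_range_check_overlap()
 * above (hypothetical offsets, not taken from a real device): with
 * bottom = 900 and top = 100 the range has wrapped, so it covers
 * {900 .. device end} and {device start .. 100}:
 *
 *	l2arc_range_check_overlap(900, 100, 950) == B_TRUE	(before the wrap)
 *	l2arc_range_check_overlap(900, 100, 50) == B_TRUE	(after the wrap)
 *	l2arc_range_check_overlap(900, 100, 500) == B_FALSE	(outside the range)
 */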
ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "System free memory target size in bytes"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_u64, spl_param_get_u64, ZMOD_RW, "Minimum bytes of dnodes in ARC"); ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, param_set_arc_int, param_get_uint, ZMOD_RW, "Percent of ARC meta buffers for dnodes"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, UINT, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, UINT, ZMOD_RW, "When full, ARC allocation waits for eviction of this % of alloc size"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_batch_limit, UINT, ZMOD_RW, "The number of headers to evict per sublist before moving to the next"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, prune_task_threads, INT, ZMOD_RW, "Number of arc_prune threads"); ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, evict_threads, UINT, ZMOD_RD, "Number of threads to use for ARC eviction."); diff --git a/sys/contrib/openzfs/module/zfs/ddt.c b/sys/contrib/openzfs/module/zfs/ddt.c index d6658375f810..0dc9adc7fd4f 100644 --- a/sys/contrib/openzfs/module/zfs/ddt.c +++ b/sys/contrib/openzfs/module/zfs/ddt.c @@ -1,2825 +1,2827 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2022 by Pawel Jakub Dawidek * Copyright (c) 2019, 2023, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * # DDT: Deduplication tables * * The dedup subsystem provides block-level deduplication. When enabled, blocks * to be written will have the dedup (D) bit set, which causes them to be * tracked in a "dedup table", or DDT. If a block has been seen before (exists * in the DDT), instead of being written, it will instead be made to reference * the existing on-disk data, and a refcount bumped in the DDT instead. * * ## Dedup tables and entries * * Conceptually, a DDT is a dictionary or map. Each entry has a "key" * (ddt_key_t) made up a block's checksum and certian properties, and a "value" * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth * time and refcount. Together these are enough to track references to a * specific block, to build a valid block pointer to reference that block (for * freeing, scrubbing, etc), and to fill a new block pointer with the missing * pieces to make it seem like it was written. * * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[]. 
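 *
 * A simplified sketch of the conceptual shape of one entry (illustrative
 * only; the real definitions are ddt_key_t and ddt_univ_phys_t in ddt.h,
 * and the value layout differs between the flat and traditional formats):
 *
 *	struct conceptual_ddt_entry {
 *		zio_cksum_t	checksum;	// key: hash of the block contents
 *		uint64_t	props;		// key: encoded lsize/psize/compress/crypt
 *		dva_t		dva[3];		// value: on-disk locations of the data
 *		uint64_t	phys_birth;	// value: txg the data was written
 *		uint64_t	refcnt;		// value: number of references held
 *	};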
* Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk * object data formats, each with their own implementations) and "classes" * (ddt_class_t, instance of a storage type object, for entries with a specific * characteristic). An entry (key) will only ever exist on one of these objects * at any given time, but may be moved from one to another if their type or * class changes. * * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block * is to be written, before DVAs have been allocated, ddt_lookup() is called to * see if the block has been seen before. If its not found, the write proceeds * as normal, and after it succeeds, a new entry is created. If it is found, we * fill the BP with the DVAs from the entry, increment the refcount and cause * the write IO to return immediately. * * Traditionally, each ddt_phys_t slot in the entry represents a separate dedup * block for the same content/checksum. The slot is selected based on the * zp_copies parameter the block is written with, that is, the number of DVAs * in the block. The "ditto" slot (DDT_PHYS_DITTO) used to be used for * now-removed "dedupditto" feature. These are no longer written, and will be * freed if encountered on old pools. * * If the "fast_dedup" feature is enabled, new dedup tables will be created * with the "flat phys" option. In this mode, there is only one ddt_phys_t * slot. If a write is issued for an entry that exists, but has fewer DVAs, * then only as many new DVAs are allocated and written to make up the * shortfall. The existing entry is then extended (ddt_phys_extend()) with the * new DVAs. * * ## Lifetime of an entry * * A DDT can be enormous, and typically is not held in memory all at once. * Instead, the changes to an entry are tracked in memory, and written down to * disk at the end of each txg. * * A "live" in-memory entry (ddt_entry_t) is a node on the live tree * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is * required for IO, ddt_lookup() is called. If an entry already exists on * ddt_tree, it is returned. Otherwise, a new one is created, and the * type/class objects for the DDT are searched for that key. If its found, its * value is copied into the live entry. If not, an empty entry is created. * * The live entry will be modified during the txg, usually by modifying the * refcount, but sometimes by adding or updating DVAs. At the end of the txg * (during spa_sync()), type and class are recalculated for entry (see * ddt_sync_entry()), and the entry is written to the appropriate storage * object and (if necessary), removed from an old one. ddt_tree is cleared and * the next txg can start. * * ## Dedup quota * * A maximum size for all DDTs on the pool can be set with the * dedup_table_quota property. This is determined in ddt_over_quota() and * enforced during ddt_lookup(). If the pool is at or over its quota limit, * ddt_lookup() will only return entries for existing blocks, as updates are * still possible. New entries will not be created; instead, ddt_lookup() will * return NULL. In response, the DDT write stage (zio_ddt_write()) will remove * the D bit on the block and reissue the IO as a regular write. The block will * not be deduplicated. * * Note that this is based on the on-disk size of the dedup store. Reclaiming * this space after deleting entries relies on the ZAP "shrinking" behaviour, * without which, no space would be recovered and the DDT would continue to be * considered "over quota". See zap_shrink_enabled. 
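 *
 * A hedged sketch of the caller-side pattern described above (not the
 * actual zio_ddt_write() code; locking and error handling elided):
 *
 *	ddt_entry_t *dde = ddt_lookup(ddt, bp, B_FALSE);
 *	if (dde == NULL) {
 *		// over quota and no stored entry: demote to a plain write
 *		BP_SET_DEDUP(bp, 0);
 *		// ... reissue the IO through the normal write pipeline ...
 *	}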
* * ## Dedup table pruning * * As a complement to the dedup quota feature, ddtprune allows removal of older * non-duplicate entries to make room for newer duplicate entries. The amount * to prune can be based on a target percentage of the unique entries or based * on the age (i.e., prune unique entry older than N days). * * ## Dedup log * * Historically, all entries modified on a txg were written back to dedup * storage objects at the end of every txg. This could cause significant * overheads, as each entry only takes up a tiny portion of a ZAP leaf node, * and so required reading the whole node, updating the entry, and writing it * back. On busy pools, this could add serious IO and memory overheads. * * To address this, the dedup log was added. If the "fast_dedup" feature is * enabled, at the end of each txg, modified entries will be copied to an * in-memory "log" object (ddt_log_t), and appended to an on-disk log. If the * same block is requested again, the in-memory object will be checked first, * and if its there, the entry inflated back onto the live tree without going * to storage. The on-disk log is only read at pool import time, to reload the * in-memory log. * * Each txg, some amount of the in-memory log will be flushed out to a DDT * storage object (ie ZAP) as normal. OpenZFS will try hard to flush enough to * keep up with the rate of change on dedup entries, but not so much that it * would impact overall throughput, and not using too much memory. See the * zfs_dedup_log_* tunables in zfs(4) for more details. * * ## Repair IO * * If a read on a dedup block fails, but there are other copies of the block in * the other ddt_phys_t slots, reads will be issued for those instead * (zio_ddt_read_start()). If one of those succeeds, the read is returned to * the caller, and a copy is stashed on the entry's dde_repair_abd. * * During the end-of-txg sync, any entries with a dde_repair_abd get a * "rewrite" write issued for the original block pointer, with the data read * from the alternate block. If the block is actually damaged, this will invoke * the pool's "self-healing" mechanism, and repair the block. * * If the "fast_dedup" feature is enabled, the "flat phys" option will be in * use, so there is only ever one ddt_phys_t slot. The repair process will * still happen in this case, though it is unlikely to succeed as there will * usually be no other equivalent blocks to fall back on (though there might * be, if this was an early version of a dedup'd block that has since been * extended). * * Note that this repair mechanism is in addition to and separate from the * regular OpenZFS scrub and self-healing mechanisms. * * ## Scanning (scrub/resilver) * * If dedup is active, the scrub machinery will walk the dedup table first, and * scrub all blocks with refcnt > 1 first. After that it will move on to the * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them. * In this way, heavily deduplicated blocks are only scrubbed once. See the * commentary on dsl_scan_ddt() for more details. * * Walking the DDT is done via ddt_walk(). The current position is stored in a * ddt_bookmark_t, which represents a stable position in the storage object. * This bookmark is stored by the scan machinery, and must reference the same * position on the object even if the object changes, the pool is exported, or * OpenZFS is upgraded. 
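 *
 * A minimal sketch of the walk pattern (assuming a zeroed bookmark starts
 * the walk from the beginning; the scan code persists the bookmark between
 * txgs rather than keeping it on the stack as shown here):
 *
 *	ddt_bookmark_t ddb = { 0 };
 *	ddt_lightweight_entry_t ddlwe;
 *	while (ddt_walk(spa, &ddb, &ddlwe) == 0) {
 *		// process one stored entry; ddb is a stable resume point
 *	}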
* * If the "fast_dedup" feature is enabled and the table has a log, the scan * cannot begin until entries on the log are flushed, as the on-disk log has no * concept of a "stable position". Instead, the log flushing process will enter * a more aggressive mode, to flush out as much as is necesary as soon as * possible, in order to begin the scan as soon as possible. * * ## Interaction with block cloning * * If block cloning and dedup are both enabled on a pool, BRT will look for the * dedup bit on an incoming block pointer. If set, it will call into the DDT * (ddt_addref()) to add a reference to the block, instead of adding a * reference to the BRT. See brt_pending_apply(). */ /* * These are the only checksums valid for dedup. They must match the list * from dedup_table in zfs_prop.c */ #define DDT_CHECKSUM_VALID(c) \ (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \ c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \ c == ZIO_CHECKSUM_BLAKE3) static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_flat_cache; static kmem_cache_t *ddt_entry_trad_cache; #define DDT_ENTRY_FLAT_SIZE (sizeof (ddt_entry_t) + DDT_FLAT_PHYS_SIZE) #define DDT_ENTRY_TRAD_SIZE (sizeof (ddt_entry_t) + DDT_TRAD_PHYS_SIZE) #define DDT_ENTRY_SIZE(ddt) \ _DDT_PHYS_SWITCH(ddt, DDT_ENTRY_FLAT_SIZE, DDT_ENTRY_TRAD_SIZE) /* * Enable/disable prefetching of dedup-ed blocks which are going to be freed. */ int zfs_dedup_prefetch = 0; /* * If the dedup class cannot satisfy a DDT allocation, treat as over quota * for this many TXGs. */ uint_t dedup_class_wait_txgs = 5; /* * How many DDT prune entries to add to the DDT sync AVL tree. * Note these addtional entries have a memory footprint of a * ddt_entry_t (216 bytes). */ static uint32_t zfs_ddt_prunes_per_txg = 50000; /* * For testing, synthesize aged DDT entries * (in global scope for ztest) */ boolean_t ddt_prune_artificial_age = B_FALSE; boolean_t ddt_dump_prune_histogram = B_FALSE; /* * Minimum time to flush per txg. */ uint_t zfs_dedup_log_flush_min_time_ms = 1000; /* * Minimum entries to flush per txg. */ uint_t zfs_dedup_log_flush_entries_min = 200; /* * Target number of TXGs until the whole dedup log has been flushed. * The log size will float around this value times the ingest rate. */ uint_t zfs_dedup_log_flush_txgs = 100; /* * Maximum entries to flush per txg. Used for testing the dedup log. */ uint_t zfs_dedup_log_flush_entries_max = UINT_MAX; /* * Soft cap for the size of the current dedup log. If the log is larger * than this size, we slightly increase the aggressiveness of the flushing to * try to bring it back down to the soft cap. */ uint_t zfs_dedup_log_cap = UINT_MAX; /* * If this is set to B_TRUE, the cap above acts more like a hard cap: * flushing is significantly more aggressive, increasing the minimum amount we * flush per txg, as well as the maximum. */ boolean_t zfs_dedup_log_hard_cap = B_FALSE; /* * Number of txgs to average flow rates across. */ uint_t zfs_dedup_log_flush_flow_rate_txgs = 10; static const ddt_ops_t *const ddt_ops[DDT_TYPES] = { &ddt_zap_ops, }; static const char *const ddt_class_name[DDT_CLASSES] = { "ditto", "duplicate", "unique", }; /* * DDT feature flags automatically enabled for each on-disk version. Note that * versions >0 cannot exist on disk without SPA_FEATURE_FAST_DEDUP enabled. 
*/ static const uint64_t ddt_version_flags[] = { [DDT_VERSION_LEGACY] = 0, [DDT_VERSION_FDT] = DDT_FLAG_FLAT | DDT_FLAG_LOG, }; /* per-DDT kstats */ typedef struct { /* total lookups and whether they returned new or existing entries */ kstat_named_t dds_lookup; kstat_named_t dds_lookup_new; kstat_named_t dds_lookup_existing; /* entries found on live tree, and if we had to wait for load */ kstat_named_t dds_lookup_live_hit; kstat_named_t dds_lookup_live_wait; kstat_named_t dds_lookup_live_miss; /* entries found on log trees */ kstat_named_t dds_lookup_log_hit; kstat_named_t dds_lookup_log_active_hit; kstat_named_t dds_lookup_log_flushing_hit; kstat_named_t dds_lookup_log_miss; /* entries found on store objects */ kstat_named_t dds_lookup_stored_hit; kstat_named_t dds_lookup_stored_miss; /* number of entries on log trees */ kstat_named_t dds_log_active_entries; kstat_named_t dds_log_flushing_entries; /* avg updated/flushed entries per txg */ kstat_named_t dds_log_ingest_rate; kstat_named_t dds_log_flush_rate; kstat_named_t dds_log_flush_time_rate; } ddt_kstats_t; static const ddt_kstats_t ddt_kstats_template = { { "lookup", KSTAT_DATA_UINT64 }, { "lookup_new", KSTAT_DATA_UINT64 }, { "lookup_existing", KSTAT_DATA_UINT64 }, { "lookup_live_hit", KSTAT_DATA_UINT64 }, { "lookup_live_wait", KSTAT_DATA_UINT64 }, { "lookup_live_miss", KSTAT_DATA_UINT64 }, { "lookup_log_hit", KSTAT_DATA_UINT64 }, { "lookup_log_active_hit", KSTAT_DATA_UINT64 }, { "lookup_log_flushing_hit", KSTAT_DATA_UINT64 }, { "lookup_log_miss", KSTAT_DATA_UINT64 }, { "lookup_stored_hit", KSTAT_DATA_UINT64 }, { "lookup_stored_miss", KSTAT_DATA_UINT64 }, { "log_active_entries", KSTAT_DATA_UINT64 }, { "log_flushing_entries", KSTAT_DATA_UINT64 }, { "log_ingest_rate", KSTAT_DATA_UINT32 }, { "log_flush_rate", KSTAT_DATA_UINT32 }, { "log_flush_time_rate", KSTAT_DATA_UINT32 }, }; #ifdef _KERNEL #define _DDT_KSTAT_STAT(ddt, stat) \ &((ddt_kstats_t *)(ddt)->ddt_ksp->ks_data)->stat.value.ui64 #define DDT_KSTAT_BUMP(ddt, stat) \ do { atomic_inc_64(_DDT_KSTAT_STAT(ddt, stat)); } while (0) #define DDT_KSTAT_ADD(ddt, stat, val) \ do { atomic_add_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) #define DDT_KSTAT_SUB(ddt, stat, val) \ do { atomic_sub_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) #define DDT_KSTAT_SET(ddt, stat, val) \ do { atomic_store_64(_DDT_KSTAT_STAT(ddt, stat), val); } while (0) #define DDT_KSTAT_ZERO(ddt, stat) DDT_KSTAT_SET(ddt, stat, 0) #else #define DDT_KSTAT_BUMP(ddt, stat) do {} while (0) #define DDT_KSTAT_ADD(ddt, stat, val) do {} while (0) #define DDT_KSTAT_SUB(ddt, stat, val) do {} while (0) #define DDT_KSTAT_SET(ddt, stat, val) do {} while (0) #define DDT_KSTAT_ZERO(ddt, stat) do {} while (0) #endif /* _KERNEL */ static void ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP; char name[DDT_NAMELEN]; ASSERT3U(ddt->ddt_dir_object, >, 0); ddt_object_name(ddt, type, class, name); ASSERT0(*objectp); VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); ASSERT3U(*objectp, !=, 0); ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); VERIFY0(zap_add(os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, objectp, tx)); VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class], tx)); } static void 
ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; objset_t *os = ddt->ddt_os; uint64_t *objectp = &ddt->ddt_object[type][class]; uint64_t count; char name[DDT_NAMELEN]; ASSERT3U(ddt->ddt_dir_object, >, 0); ddt_object_name(ddt, type, class, name); ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); VERIFY0(ddt_object_count(ddt, type, class, &count)); VERIFY0(count); VERIFY0(zap_remove(os, ddt->ddt_dir_object, name, tx)); VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); *objectp = 0; } static int ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; uint64_t count; char name[DDT_NAMELEN]; int error; if (ddt->ddt_dir_object == 0) { /* * If we're configured but the containing dir doesn't exist * yet, then this object can't possibly exist either. */ ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); return (SET_ERROR(ENOENT)); } ddt_object_name(ddt, type, class, name); error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); if (error != 0) return (error); error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class]); if (error != 0) return (error); /* * Seed the cached statistics. */ error = ddt_object_info(ddt, type, class, &doi); if (error) return (error); error = ddt_object_count(ddt, type, class, &count); if (error) return (error); ddo->ddo_count = count; ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; return (0); } static void ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; uint64_t count; char name[DDT_NAMELEN]; ddt_object_name(ddt, type, class, name); VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), &ddt->ddt_histogram[type][class], tx)); /* * Cache DDT statistics; this is the only time they'll change. 
*/ VERIFY0(ddt_object_info(ddt, type, class, &doi)); VERIFY0(ddt_object_count(ddt, type, class, &count)); ddo->ddo_count = count; ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; } static boolean_t ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { return (!!ddt->ddt_object[type][class]); } static int ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) return (SET_ERROR(ENOENT)); return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, DDT_PHYS_SIZE(ddt))); } static int ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class, const ddt_key_t *ddk) { if (!ddt_object_exists(ddt, type, class)) return (SET_ERROR(ENOENT)); return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os, ddt->ddt_object[type][class], ddk)); } static void ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class, const ddt_key_t *ddk) { if (!ddt_object_exists(ddt, type, class)) return; ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, ddt->ddt_object[type][class], ddk); } static void ddt_object_prefetch_all(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { if (!ddt_object_exists(ddt, type, class)) return; ddt_ops[type]->ddt_op_prefetch_all(ddt->ddt_os, ddt->ddt_object[type][class]); } static int ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, const ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, ddt->ddt_object[type][class], &ddlwe->ddlwe_key, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt), tx)); } static int ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, const ddt_key_t *ddk, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, ddt->ddt_object[type][class], ddk, tx)); } int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *walk, ddt_lightweight_entry_t *ddlwe) { ASSERT(ddt_object_exists(ddt, type, class)); int error = ddt_ops[type]->ddt_op_walk(ddt->ddt_os, ddt->ddt_object[type][class], walk, &ddlwe->ddlwe_key, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); if (error == 0) { ddlwe->ddlwe_type = type; ddlwe->ddlwe_class = class; return (0); } return (error); } int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *count) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, ddt->ddt_object[type][class], count)); } int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_object_info_t *doi) { if (!ddt_object_exists(ddt, type, class)) return (SET_ERROR(ENOENT)); return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], doi)); } void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, char *name) { (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, zio_checksum_table[ddt->ddt_checksum].ci_name, ddt_ops[type]->ddt_op_name, ddt_class_name[class]); } void ddt_bp_fill(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp, uint64_t txg) { ASSERT3U(txg, !=, 0); ASSERT3U(v, <, DDT_PHYS_NONE); uint64_t phys_birth; const dva_t *dvap; if (v == DDT_PHYS_FLAT) { phys_birth = ddp->ddp_flat.ddp_phys_birth; dvap = ddp->ddp_flat.ddp_dva; } else { phys_birth = ddp->ddp_trad[v].ddp_phys_birth; dvap = ddp->ddp_trad[v].ddp_dva; } for (int d = 0; d < SPA_DVAS_PER_BP; d++) bp->blk_dva[d] = dvap[d]; BP_SET_BIRTH(bp, txg, 
phys_birth); } /* * The bp created via this function may be used for repairs and scrub, but it * will be missing the salt / IV required to do a full decrypting read. */ void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, blkptr_t *bp) { BP_ZERO(bp); if (ddp != NULL) ddt_bp_fill(ddp, v, bp, ddt_phys_birth(ddp, v)); bp->blk_cksum = ddk->ddk_cksum; BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk)); BP_SET_FILL(bp, 1); BP_SET_CHECKSUM(bp, checksum); BP_SET_TYPE(bp, DMU_OT_DEDUP); BP_SET_LEVEL(bp, 0); BP_SET_DEDUP(bp, 1); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); } void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) { ddk->ddk_cksum = bp->blk_cksum; ddk->ddk_prop = 0; ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp)); DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp)); } void ddt_phys_extend(ddt_univ_phys_t *ddp, ddt_phys_variant_t v, const blkptr_t *bp) { ASSERT3U(v, <, DDT_PHYS_NONE); int bp_ndvas = BP_GET_NDVAS(bp); int ddp_max_dvas = BP_IS_ENCRYPTED(bp) ? SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; dva_t *dvas = (v == DDT_PHYS_FLAT) ? ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; int s = 0, d = 0; while (s < bp_ndvas && d < ddp_max_dvas) { if (DVA_IS_VALID(&dvas[d])) { d++; continue; } dvas[d] = bp->blk_dva[s]; s++; d++; } /* * If the caller offered us more DVAs than we can fit, something has * gone wrong in their accounting. zio_ddt_write() should never ask for * more than we need. */ ASSERT3U(s, ==, bp_ndvas); if (BP_IS_ENCRYPTED(bp)) dvas[2] = bp->blk_dva[2]; if (ddt_phys_birth(ddp, v) == 0) { if (v == DDT_PHYS_FLAT) { ddp->ddp_flat.ddp_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); } else { ddp->ddp_trad[v].ddp_phys_birth = BP_GET_PHYSICAL_BIRTH(bp); } } } void ddt_phys_unextend(ddt_univ_phys_t *cur, ddt_univ_phys_t *orig, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); dva_t *cur_dvas = (v == DDT_PHYS_FLAT) ? cur->ddp_flat.ddp_dva : cur->ddp_trad[v].ddp_dva; dva_t *orig_dvas = (v == DDT_PHYS_FLAT) ? orig->ddp_flat.ddp_dva : orig->ddp_trad[v].ddp_dva; for (int d = 0; d < SPA_DVAS_PER_BP; d++) cur_dvas[d] = orig_dvas[d]; if (ddt_phys_birth(orig, v) == 0) { if (v == DDT_PHYS_FLAT) cur->ddp_flat.ddp_phys_birth = 0; else cur->ddp_trad[v].ddp_phys_birth = 0; } } void ddt_phys_copy(ddt_univ_phys_t *dst, const ddt_univ_phys_t *src, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) dst->ddp_flat = src->ddp_flat; else dst->ddp_trad[v] = src->ddp_trad[v]; } void ddt_phys_clear(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) memset(&ddp->ddp_flat, 0, DDT_FLAT_PHYS_SIZE); else memset(&ddp->ddp_trad[v], 0, DDT_TRAD_PHYS_SIZE / DDT_PHYS_MAX); } static uint64_t ddt_class_start(void) { uint64_t start = gethrestime_sec(); if (ddt_prune_artificial_age) { /* * debug aide -- simulate a wider distribution * so we don't have to wait for an aged DDT * to test prune. 
*/ int range = 1 << 21; int percent = random_in_range(100); if (percent < 50) { range = range >> 4; } else if (percent > 75) { range /= 2; } start -= random_in_range(range); } return (start); } void ddt_phys_addref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) ddp->ddp_flat.ddp_refcnt++; else ddp->ddp_trad[v].ddp_refcnt++; } uint64_t ddt_phys_decref(ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); uint64_t *refcntp; if (v == DDT_PHYS_FLAT) refcntp = &ddp->ddp_flat.ddp_refcnt; else refcntp = &ddp->ddp_trad[v].ddp_refcnt; ASSERT3U(*refcntp, >, 0); (*refcntp)--; return (*refcntp); } static void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_univ_phys_t *ddp, ddt_phys_variant_t v, uint64_t txg) { blkptr_t blk; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); /* * We clear the dedup bit so that zio_free() will actually free the * space, rather than just decrementing the refcount in the DDT. */ BP_SET_DEDUP(&blk, 0); ddt_phys_clear(ddp, v); zio_free(ddt->ddt_spa, txg, &blk); } uint64_t ddt_phys_birth(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) return (ddp->ddp_flat.ddp_phys_birth); else return (ddp->ddp_trad[v].ddp_phys_birth); } int ddt_phys_is_gang(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); const dva_t *dvas = (v == DDT_PHYS_FLAT) ? ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; return (DVA_GET_GANG(&dvas[0])); } int ddt_phys_dva_count(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v, boolean_t encrypted) { ASSERT3U(v, <, DDT_PHYS_NONE); const dva_t *dvas = (v == DDT_PHYS_FLAT) ? ddp->ddp_flat.ddp_dva : ddp->ddp_trad[v].ddp_dva; return (DVA_IS_VALID(&dvas[0]) + DVA_IS_VALID(&dvas[1]) + DVA_IS_VALID(&dvas[2]) * !encrypted); } ddt_phys_variant_t ddt_phys_select(const ddt_t *ddt, const ddt_entry_t *dde, const blkptr_t *bp) { if (dde == NULL) return (DDT_PHYS_NONE); const ddt_univ_phys_t *ddp = dde->dde_phys; if (ddt->ddt_flags & DDT_FLAG_FLAT) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_flat.ddp_dva[0]) && BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_flat.ddp_phys_birth) { return (DDT_PHYS_FLAT); } } else /* traditional phys */ { for (int p = 0; p < DDT_PHYS_MAX; p++) { if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_trad[p].ddp_dva[0]) && BP_GET_PHYSICAL_BIRTH(bp) == ddp->ddp_trad[p].ddp_phys_birth) { return (p); } } } return (DDT_PHYS_NONE); } uint64_t ddt_phys_refcnt(const ddt_univ_phys_t *ddp, ddt_phys_variant_t v) { ASSERT3U(v, <, DDT_PHYS_NONE); if (v == DDT_PHYS_FLAT) return (ddp->ddp_flat.ddp_refcnt); else return (ddp->ddp_trad[v].ddp_refcnt); } uint64_t ddt_phys_total_refcnt(const ddt_t *ddt, const ddt_univ_phys_t *ddp) { uint64_t refcnt = 0; if (ddt->ddt_flags & DDT_FLAG_FLAT) refcnt = ddp->ddp_flat.ddp_refcnt; else for (int v = DDT_PHYS_SINGLE; v <= DDT_PHYS_TRIPLE; v++) refcnt += ddp->ddp_trad[v].ddp_refcnt; return (refcnt); } ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp))); return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); } void ddt_enter(ddt_t *ddt) { mutex_enter(&ddt->ddt_lock); } void ddt_exit(ddt_t *ddt) { mutex_exit(&ddt->ddt_lock); } void ddt_init(void) { ddt_cache = kmem_cache_create("ddt_cache", sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_flat_cache = kmem_cache_create("ddt_entry_flat_cache", DDT_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_entry_trad_cache = kmem_cache_create("ddt_entry_trad_cache", DDT_ENTRY_TRAD_SIZE, 0, 
NULL, NULL, NULL, NULL, NULL, 0); ddt_log_init(); } void ddt_fini(void) { ddt_log_fini(); kmem_cache_destroy(ddt_entry_trad_cache); kmem_cache_destroy(ddt_entry_flat_cache); kmem_cache_destroy(ddt_cache); } static ddt_entry_t * ddt_alloc(const ddt_t *ddt, const ddt_key_t *ddk) { ddt_entry_t *dde; if (ddt->ddt_flags & DDT_FLAG_FLAT) { dde = kmem_cache_alloc(ddt_entry_flat_cache, KM_SLEEP); memset(dde, 0, DDT_ENTRY_FLAT_SIZE); } else { dde = kmem_cache_alloc(ddt_entry_trad_cache, KM_SLEEP); memset(dde, 0, DDT_ENTRY_TRAD_SIZE); } cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); dde->dde_key = *ddk; return (dde); } void ddt_alloc_entry_io(ddt_entry_t *dde) { if (dde->dde_io != NULL) return; dde->dde_io = kmem_zalloc(sizeof (ddt_entry_io_t), KM_SLEEP); } static void ddt_free(const ddt_t *ddt, ddt_entry_t *dde) { if (dde->dde_io != NULL) { for (int p = 0; p < DDT_NPHYS(ddt); p++) ASSERT0P(dde->dde_io->dde_lead_zio[p]); if (dde->dde_io->dde_repair_abd != NULL) abd_free(dde->dde_io->dde_repair_abd); kmem_free(dde->dde_io, sizeof (ddt_entry_io_t)); } cv_destroy(&dde->dde_cv); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? ddt_entry_flat_cache : ddt_entry_trad_cache, dde); } void ddt_remove(ddt_t *ddt, ddt_entry_t *dde) { ASSERT(MUTEX_HELD(&ddt->ddt_lock)); /* Entry is still in the log, so charge the entry back to it */ if (dde->dde_flags & DDE_FLAG_LOGGED) { ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); } avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } /* * We're considered over quota when we hit 85% full, or for larger drives, * when there is less than 8GB free. */ static boolean_t ddt_special_over_quota(metaslab_class_t *mc) { uint64_t allocated = metaslab_class_get_alloc(mc); uint64_t capacity = metaslab_class_get_space(mc); uint64_t limit = MAX(capacity * 85 / 100, (capacity > (1LL<<33)) ? capacity - (1LL<<33) : 0); return (allocated >= limit); } /* * Check if the DDT is over its quota. This can be due to a few conditions: * 1. 'dedup_table_quota' property is not 0 (none) and the dedup dsize * exceeds this limit * * 2. 'dedup_table_quota' property is set to automatic and * a. the dedup or special allocation class could not satisfy a DDT * allocation in a recent transaction * b. the dedup or special allocation class has exceeded its 85% limit */ static boolean_t ddt_over_quota(spa_t *spa) { if (spa->spa_dedup_table_quota == 0) return (B_FALSE); if (spa->spa_dedup_table_quota != UINT64_MAX) return (ddt_get_ddt_dsize(spa) > spa->spa_dedup_table_quota); /* * Over quota if have to allocate outside of the dedup/special class. */ if (spa_syncing_txg(spa) <= spa->spa_dedup_class_full_txg + dedup_class_wait_txgs) { /* Waiting for some deferred frees to be processed */ return (B_TRUE); } /* * For automatic quota, table size is limited by dedup or special class */ if (spa_has_dedup(spa)) return (ddt_special_over_quota(spa_dedup_class(spa))); else if (spa_special_has_ddt(spa)) return (ddt_special_over_quota(spa_special_class(spa))); return (B_FALSE); } void ddt_prefetch_all(spa_t *spa) { /* * Load all DDT entries for each type/class combination. This is * indended to perform a prefetch on all such blocks. For the same * reason that ddt_prefetch isn't locked, this is also not locked. 
*/ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (!ddt) continue; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_object_prefetch_all(ddt, type, class); } } } } static int ddt_configure(ddt_t *ddt, boolean_t new); /* * If the BP passed to ddt_lookup has valid DVAs, then we need to compare them * to the ones in the entry. If they're different, then the passed-in BP is * from a previous generation of this entry (ie was previously pruned) and we * have to act like the entry doesn't exist at all. * * This should only happen during a lookup to free the block (zio_ddt_free()). * * XXX this is similar in spirit to ddt_phys_select(), maybe can combine * -- robn, 2024-02-09 */ static boolean_t ddt_entry_lookup_is_valid(ddt_t *ddt, const blkptr_t *bp, ddt_entry_t *dde) { /* If the BP has no DVAs, then this entry is good */ uint_t ndvas = BP_GET_NDVAS(bp); if (ndvas == 0) return (B_TRUE); /* * Only checking the phys for the copies. For flat, there's only one; * for trad it'll be the one that has the matching set of DVAs. */ const dva_t *dvas = (ddt->ddt_flags & DDT_FLAG_FLAT) ? dde->dde_phys->ddp_flat.ddp_dva : dde->dde_phys->ddp_trad[ndvas].ddp_dva; /* * Compare entry DVAs with the BP. They should all be there, but * there's not really anything we can do if its only partial anyway, * that's an error somewhere else, maybe long ago. */ uint_t d; for (d = 0; d < ndvas; d++) if (!DVA_EQUAL(&dvas[d], &bp->blk_dva[d])) return (B_FALSE); ASSERT3U(d, ==, ndvas); return (B_TRUE); } ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t verify) { spa_t *spa = ddt->ddt_spa; ddt_key_t search; ddt_entry_t *dde; ddt_type_t type; ddt_class_t class; avl_index_t where; int error; ASSERT(MUTEX_HELD(&ddt->ddt_lock)); if (ddt->ddt_version == DDT_VERSION_UNCONFIGURED) { /* * This is the first use of this DDT since the pool was * created; finish getting it ready for use. */ VERIFY0(ddt_configure(ddt, B_TRUE)); ASSERT3U(ddt->ddt_version, !=, DDT_VERSION_UNCONFIGURED); } DDT_KSTAT_BUMP(ddt, dds_lookup); ddt_key_fill(&search, bp); /* Find an existing live entry */ dde = avl_find(&ddt->ddt_tree, &search, &where); if (dde != NULL) { /* If we went over quota, act like we didn't find it */ if (dde->dde_flags & DDE_FLAG_OVERQUOTA) return (NULL); /* If it's already loaded, we can just return it. */ DDT_KSTAT_BUMP(ddt, dds_lookup_live_hit); if (dde->dde_flags & DDE_FLAG_LOADED) { if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde)) return (dde); return (NULL); } /* Someone else is loading it, wait for it. */ dde->dde_waiters++; DDT_KSTAT_BUMP(ddt, dds_lookup_live_wait); while (!(dde->dde_flags & DDE_FLAG_LOADED)) cv_wait(&dde->dde_cv, &ddt->ddt_lock); dde->dde_waiters--; /* Loaded but over quota, forget we were ever here */ if (dde->dde_flags & DDE_FLAG_OVERQUOTA) { if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); } return (NULL); } DDT_KSTAT_BUMP(ddt, dds_lookup_existing); /* Make sure the loaded entry matches the BP */ if (!verify || ddt_entry_lookup_is_valid(ddt, bp, dde)) return (dde); return (NULL); } else DDT_KSTAT_BUMP(ddt, dds_lookup_live_miss); /* Time to make a new entry. 
*/ dde = ddt_alloc(ddt, &search); /* Record the time this class was created (used by ddt prune) */ if (ddt->ddt_flags & DDT_FLAG_FLAT) dde->dde_phys->ddp_flat.ddp_class_start = ddt_class_start(); avl_insert(&ddt->ddt_tree, dde, where); /* If its in the log tree, we can "load" it from there */ if (ddt->ddt_flags & DDT_FLAG_LOG) { ddt_lightweight_entry_t ddlwe; if (ddt_log_find_key(ddt, &search, &ddlwe)) { /* * See if we have the key first, and if so, set up * the entry. */ dde->dde_type = ddlwe.ddlwe_type; dde->dde_class = ddlwe.ddlwe_class; memcpy(dde->dde_phys, &ddlwe.ddlwe_phys, DDT_PHYS_SIZE(ddt)); /* Whatever we found isn't valid for this BP, eject */ if (verify && !ddt_entry_lookup_is_valid(ddt, bp, dde)) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); return (NULL); } /* Remove it and count it */ if (ddt_log_remove_key(ddt, ddt->ddt_log_active, &search)) { DDT_KSTAT_BUMP(ddt, dds_lookup_log_active_hit); } else { VERIFY(ddt_log_remove_key(ddt, ddt->ddt_log_flushing, &search)); DDT_KSTAT_BUMP(ddt, dds_lookup_log_flushing_hit); } dde->dde_flags = DDE_FLAG_LOADED | DDE_FLAG_LOGGED; DDT_KSTAT_BUMP(ddt, dds_lookup_log_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); return (dde); } DDT_KSTAT_BUMP(ddt, dds_lookup_log_miss); } /* * ddt_tree is now stable, so unlock and let everyone else keep moving. * Anyone landing on this entry will find it without DDE_FLAG_LOADED, * and go to sleep waiting for it above. */ ddt_exit(ddt); /* Search all store objects for the entry. */ error = ENOENT; for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_lookup(ddt, type, class, dde); if (error != ENOENT) { ASSERT0(error); break; } } if (error != ENOENT) break; } ddt_enter(ddt); ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED)); dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ boolean_t valid = B_TRUE; if (dde->dde_type == DDT_TYPES && dde->dde_class == DDT_CLASSES && ddt_over_quota(spa)) { /* Over quota. If no one is waiting, clean up right now. */ if (dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); return (NULL); } /* Flag cleanup required */ dde->dde_flags |= DDE_FLAG_OVERQUOTA; } else if (error == 0) { /* * If what we loaded is no good for this BP and there's no one * waiting for it, we can just remove it and get out. If its no * good but there are waiters, we have to leave it, because we * don't know what they want. If its not needed we'll end up * taking an entry log/sync, but it can only happen if more * than one previous version of this block is being deleted at * the same time. This is extremely unlikely to happen and not * worth the effort to deal with without taking an entry * update. */ valid = !verify || ddt_entry_lookup_is_valid(ddt, bp, dde); if (!valid && dde->dde_waiters == 0) { avl_remove(&ddt->ddt_tree, dde); ddt_free(ddt, dde); return (NULL); } DDT_KSTAT_BUMP(ddt, dds_lookup_stored_hit); DDT_KSTAT_BUMP(ddt, dds_lookup_existing); /* * The histograms only track inactive (stored or logged) blocks. * We've just put an entry onto the live list, so we need to * remove its counts. When its synced back, it'll be re-added * to the right one. * * We only do this when we successfully found it in the store. * error == ENOENT means this is a new entry, and so its already * not counted. 
*/ ddt_histogram_t *ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_histogram_sub_entry(ddt, ddh, &ddlwe); } else { DDT_KSTAT_BUMP(ddt, dds_lookup_stored_miss); DDT_KSTAT_BUMP(ddt, dds_lookup_new); } /* Entry loaded, everyone can proceed now */ dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); if ((dde->dde_flags & DDE_FLAG_OVERQUOTA) || !valid) return (NULL); return (dde); } void ddt_prefetch(spa_t *spa, const blkptr_t *bp) { ddt_t *ddt; ddt_key_t ddk; if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) return; /* * We only remove the DDT once all tables are empty and only * prefetch dedup blocks when there are entries in the DDT. * Thus no locking is required as the DDT can't disappear on us. */ ddt = ddt_select(spa, bp); ddt_key_fill(&ddk, bp); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_object_prefetch(ddt, type, class, &ddk); } } } /* * ddt_key_t comparison. Any struct wanting to make use of this function must * have the key as the first element. Casts it to N uint64_ts, and checks until * we find there's a difference. This is intended to match how ddt_zap.c drives * the ZAPs (first uint64_t as the key prehash), which will minimise the number * of ZAP blocks touched when flushing logged entries from an AVL walk. This is * not an invariant for this function though, should you wish to change it. */ int ddt_key_compare(const void *x1, const void *x2) { const uint64_t *k1 = (const uint64_t *)x1; const uint64_t *k2 = (const uint64_t *)x2; int cmp; for (int i = 0; i < (sizeof (ddt_key_t) / sizeof (uint64_t)); i++) if (likely((cmp = TREE_CMP(k1[i], k2[i])) != 0)) return (cmp); return (0); } /* Create the containing dir for this DDT and bump the feature count */ static void ddt_create_dir(ddt_t *ddt, dmu_tx_t *tx) { ASSERT0(ddt->ddt_dir_object); ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); char name[DDT_NAMELEN]; snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, zio_checksum_table[ddt->ddt_checksum].ci_name); ddt->ddt_dir_object = zap_create_link(ddt->ddt_os, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, name, tx); VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, sizeof (uint64_t), 1, &ddt->ddt_version, tx)); VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, sizeof (uint64_t), 1, &ddt->ddt_flags, tx)); spa_feature_incr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, tx); } /* Destroy the containing dir and deactivate the feature */ static void ddt_destroy_dir(ddt_t *ddt, dmu_tx_t *tx) { ASSERT3U(ddt->ddt_dir_object, !=, 0); ASSERT3U(ddt->ddt_dir_object, !=, DMU_POOL_DIRECTORY_OBJECT); ASSERT3U(ddt->ddt_version, ==, DDT_VERSION_FDT); char name[DDT_NAMELEN]; snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, zio_checksum_table[ddt->ddt_checksum].ci_name); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ASSERT(!ddt_object_exists(ddt, type, class)); } } ddt_log_destroy(ddt, tx); uint64_t count; ASSERT0(zap_count(ddt->ddt_os, ddt->ddt_dir_object, &count)); ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION)); ASSERT0(zap_contains(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS)); ASSERT3U(count, ==, 2); VERIFY0(zap_remove(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); VERIFY0(zap_destroy(ddt->ddt_os, ddt->ddt_dir_object, tx)); ddt->ddt_dir_object = 0; spa_feature_decr(ddt->ddt_spa, SPA_FEATURE_FAST_DEDUP, 
tx); } /* * Determine, flags and on-disk layout from what's already stored. If there's * nothing stored, then if new is false, returns ENOENT, and if true, selects * based on pool config. */ static int ddt_configure(ddt_t *ddt, boolean_t new) { spa_t *spa = ddt->ddt_spa; char name[DDT_NAMELEN]; int error; ASSERT3U(spa_load_state(spa), !=, SPA_LOAD_CREATE); boolean_t fdt_enabled = spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP); boolean_t fdt_active = spa_feature_is_active(spa, SPA_FEATURE_FAST_DEDUP); /* * First, look for the global DDT stats object. If its not there, then * there's never been a DDT written before ever, and we know we're * starting from scratch. */ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); if (error != 0) { if (error != ENOENT) return (error); goto not_found; } if (fdt_active) { /* * Now look for a DDT directory. If it exists, then it has * everything we need. */ snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_DIR, zio_checksum_table[ddt->ddt_checksum].ci_name); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &ddt->ddt_dir_object); if (error == 0) { ASSERT3U(spa->spa_meta_objset, ==, ddt->ddt_os); error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_VERSION, sizeof (uint64_t), 1, &ddt->ddt_version); if (error != 0) return (error); error = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, DDT_DIR_FLAGS, sizeof (uint64_t), 1, &ddt->ddt_flags); if (error != 0) return (error); if (ddt->ddt_version != DDT_VERSION_FDT) { zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " "unknown version %llu", spa_name(spa), name, (u_longlong_t)ddt->ddt_version); return (SET_ERROR(EINVAL)); } if ((ddt->ddt_flags & ~DDT_FLAG_MASK) != 0) { zfs_dbgmsg("ddt_configure: spa=%s ddt_dir=%s " "version=%llu unknown flags %llx", spa_name(spa), name, (u_longlong_t)ddt->ddt_flags, (u_longlong_t)ddt->ddt_version); return (SET_ERROR(EINVAL)); } return (0); } if (error != ENOENT) return (error); } /* Any object in the root indicates a traditional setup. 
*/ for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_object_name(ddt, type, class, name); uint64_t obj; error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &obj); if (error == ENOENT) continue; if (error != 0) return (error); ddt->ddt_version = DDT_VERSION_LEGACY; ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; return (0); } } not_found: if (!new) return (SET_ERROR(ENOENT)); /* Nothing on disk, so set up for the best version we can */ if (fdt_enabled) { ddt->ddt_version = DDT_VERSION_FDT; ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; ddt->ddt_dir_object = 0; /* create on first use */ } else { ddt->ddt_version = DDT_VERSION_LEGACY; ddt->ddt_flags = ddt_version_flags[ddt->ddt_version]; ddt->ddt_dir_object = DMU_POOL_DIRECTORY_OBJECT; } return (0); } static void ddt_table_alloc_kstats(ddt_t *ddt) { char *mod = kmem_asprintf("zfs/%s", spa_name(ddt->ddt_spa)); char *name = kmem_asprintf("ddt_stats_%s", zio_checksum_table[ddt->ddt_checksum].ci_name); ddt->ddt_ksp = kstat_create(mod, 0, name, "misc", KSTAT_TYPE_NAMED, sizeof (ddt_kstats_t) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (ddt->ddt_ksp != NULL) { ddt_kstats_t *dds = kmem_alloc(sizeof (ddt_kstats_t), KM_SLEEP); memcpy(dds, &ddt_kstats_template, sizeof (ddt_kstats_t)); ddt->ddt_ksp->ks_data = dds; kstat_install(ddt->ddt_ksp); } kmem_strfree(name); kmem_strfree(mod); } static ddt_t * ddt_table_alloc(spa_t *spa, enum zio_checksum c) { ddt_t *ddt; ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); memset(ddt, 0, sizeof (ddt_t)); mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); ddt->ddt_checksum = c; ddt->ddt_spa = spa; ddt->ddt_os = spa->spa_meta_objset; ddt->ddt_version = DDT_VERSION_UNCONFIGURED; ddt->ddt_log_flush_pressure = 10; ddt_log_alloc(ddt); ddt_table_alloc_kstats(ddt); return (ddt); } static void ddt_table_free(ddt_t *ddt) { if (ddt->ddt_ksp != NULL) { kmem_free(ddt->ddt_ksp->ks_data, sizeof (ddt_kstats_t)); ddt->ddt_ksp->ks_data = NULL; kstat_delete(ddt->ddt_ksp); } ddt_log_free(ddt); ASSERT0(avl_numnodes(&ddt->ddt_tree)); ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); avl_destroy(&ddt->ddt_repair_tree); mutex_destroy(&ddt->ddt_lock); kmem_cache_free(ddt_cache, ddt); } void ddt_create(spa_t *spa) { spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { if (DDT_CHECKSUM_VALID(c)) spa->spa_ddt[c] = ddt_table_alloc(spa, c); } } int ddt_load(spa_t *spa) { int error; ddt_create(spa); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, &spa->spa_ddt_stat_object); if (error) return (error == ENOENT ? 
0 : error); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { if (!DDT_CHECKSUM_VALID(c)) continue; ddt_t *ddt = spa->spa_ddt[c]; error = ddt_configure(ddt, B_FALSE); if (error == ENOENT) continue; if (error != 0) return (error); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { error = ddt_object_load(ddt, type, class); if (error != 0 && error != ENOENT) return (error); } } - error = ddt_log_load(ddt); - if (error != 0 && error != ENOENT) - return (error); + if (ddt->ddt_flags & DDT_FLAG_LOG) { + error = ddt_log_load(ddt); + if (error != 0 && error != ENOENT) + return (error); + } DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); DDT_KSTAT_SET(ddt, dds_log_flushing_entries, avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); /* * Seed the cached histograms. */ memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); } spa->spa_dedup_dspace = ~0ULL; spa->spa_dedup_dsize = ~0ULL; return (0); } void ddt_unload(spa_t *spa) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { if (spa->spa_ddt[c]) { ddt_table_free(spa->spa_ddt[c]); spa->spa_ddt[c] = NULL; } } } boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) { ddt_t *ddt; ddt_key_t ddk; if (!BP_GET_DEDUP(bp)) return (B_FALSE); if (max_class == DDT_CLASS_UNIQUE) return (B_TRUE); ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; ddt_key_fill(&ddk, bp); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class <= max_class; class++) { if (ddt_object_contains(ddt, type, class, &ddk) == 0) return (B_TRUE); } } return (B_FALSE); } ddt_entry_t * ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) { ddt_key_t ddk; ddt_entry_t *dde; ddt_key_fill(&ddk, bp); dde = ddt_alloc(ddt, &ddk); ddt_alloc_entry_io(dde); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { /* * We can only do repair if there are multiple copies * of the block. For anything in the UNIQUE class, * there's definitely only one copy, so don't even try. 
*/ if (class != DDT_CLASS_UNIQUE && ddt_object_lookup(ddt, type, class, dde) == 0) return (dde); } } memset(dde->dde_phys, 0, DDT_PHYS_SIZE(ddt)); return (dde); } void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) { avl_index_t where; ddt_enter(ddt); if (dde->dde_io->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) avl_insert(&ddt->ddt_repair_tree, dde, where); else ddt_free(ddt, dde); ddt_exit(ddt); } static void ddt_repair_entry_done(zio_t *zio) { ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); ddt_entry_t *rdde = zio->io_private; ddt_free(ddt, rdde); } static void ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) { ddt_key_t *ddk = &dde->dde_key; ddt_key_t *rddk = &rdde->dde_key; zio_t *zio; blkptr_t blk; zio = zio_null(rio, rio->io_spa, NULL, ddt_repair_entry_done, rdde, rio->io_flags); for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_univ_phys_t *ddp = dde->dde_phys; ddt_univ_phys_t *rddp = rdde->dde_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_birth = ddt_phys_birth(ddp, v); const dva_t *dvas, *rdvas; if (ddt->ddt_flags & DDT_FLAG_FLAT) { dvas = ddp->ddp_flat.ddp_dva; rdvas = rddp->ddp_flat.ddp_dva; } else { dvas = ddp->ddp_trad[p].ddp_dva; rdvas = rddp->ddp_trad[p].ddp_dva; } if (phys_birth == 0 || phys_birth != ddt_phys_birth(rddp, v) || memcmp(dvas, rdvas, sizeof (dva_t) * SPA_DVAS_PER_BP)) continue; ddt_bp_create(ddt->ddt_checksum, ddk, ddp, v, &blk); zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, rdde->dde_io->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); } zio_nowait(zio); } static void ddt_repair_table(ddt_t *ddt, zio_t *rio) { spa_t *spa = ddt->ddt_spa; ddt_entry_t *dde, *rdde_next, *rdde; avl_tree_t *t = &ddt->ddt_repair_tree; blkptr_t blk; if (spa_sync_pass(spa) > 1) return; ddt_enter(ddt); for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { rdde_next = AVL_NEXT(t, rdde); avl_remove(&ddt->ddt_repair_tree, rdde); ddt_exit(ddt); ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, DDT_PHYS_NONE, &blk); dde = ddt_repair_start(ddt, &blk); ddt_repair_entry(ddt, dde, rdde, rio); ddt_repair_done(ddt, dde); ddt_enter(ddt); } ddt_exit(ddt); } static void ddt_sync_update_stats(ddt_t *ddt, dmu_tx_t *tx) { /* * Count all the entries stored for each type/class, and updates the * stats within (ddt_object_sync()). If there's no entries for the * type/class, the whole object is removed. If all objects for the DDT * are removed, its containing dir is removed, effectively resetting * the entire DDT to an empty slate. */ uint64_t count = 0; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { uint64_t add, tcount = 0; for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); VERIFY0(ddt_object_count(ddt, type, class, &add)); tcount += add; } } for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (tcount == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } count += tcount; } if (ddt->ddt_flags & DDT_FLAG_LOG) { /* Include logged entries in the total count */ count += avl_numnodes(&ddt->ddt_log_active->ddl_tree); count += avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); } if (count == 0) { /* * No entries left on the DDT, so reset the version for next * time. This allows us to handle the feature being changed * since the DDT was originally created. 
New entries should get * whatever the feature currently demands. */ if (ddt->ddt_version == DDT_VERSION_FDT) ddt_destroy_dir(ddt, tx); ddt->ddt_version = DDT_VERSION_UNCONFIGURED; ddt->ddt_flags = 0; } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); ddt->ddt_spa->spa_dedup_dspace = ~0ULL; ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } static void ddt_sync_scan_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; /* * Compute the target class, so we can decide whether or not to inform * the scrub traversal (below). Note that we don't store this in the * entry, as it might change multiple times before finally being * committed (if we're logging). Instead, we recompute it in * ddt_sync_entry(). */ uint64_t refcnt = ddt_phys_total_refcnt(ddt, &ddlwe->ddlwe_phys); ddt_class_t nclass = (refcnt > 1) ? DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; /* * If the class changes, the order that we scan this bp changes. If it * decreases, we could miss it, so scan it right now. (This covers both * class changing while we are doing ddt_walk(), and when we are * traversing.) * * We also do this when the refcnt goes to zero, because that change is * only in the log so far; the blocks on disk won't be freed until * the log is flushed, and the refcnt might increase before that. If it * does, then we could miss it in the same way. */ if (refcnt == 0 || nclass < ddlwe->ddlwe_class) dsl_scan_ddt_entry(dp->dp_scan, ddt->ddt_checksum, ddt, ddlwe, tx); } static void ddt_sync_flush_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_type_t otype, ddt_class_t oclass, dmu_tx_t *tx) { ddt_key_t *ddk = &ddlwe->ddlwe_key; ddt_type_t ntype = DDT_TYPE_DEFAULT; uint64_t refcnt = 0; /* * Compute the total refcnt. Along the way, issue frees for any DVAs * we no longer want. */ for (int p = 0; p < DDT_NPHYS(ddt); p++) { ddt_univ_phys_t *ddp = &ddlwe->ddlwe_phys; ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); uint64_t phys_refcnt = ddt_phys_refcnt(ddp, v); if (ddt_phys_birth(ddp, v) == 0) { ASSERT0(phys_refcnt); continue; } if (DDT_PHYS_IS_DITTO(ddt, p)) { /* * We don't want to keep any obsolete slots (eg ditto), * regardless of their refcount, but we don't want to * leak them either. So, free them. */ ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); continue; } if (phys_refcnt == 0) /* No remaining references, free it! */ ddt_phys_free(ddt, ddk, ddp, v, tx->tx_txg); refcnt += phys_refcnt; } /* Select the best class for the entry. */ ddt_class_t nclass = (refcnt > 1) ? 
DDT_CLASS_DUPLICATE : DDT_CLASS_UNIQUE; /* * If an existing entry changed type or class, or its refcount reached * zero, delete it from the DDT object */ if (otype != DDT_TYPES && (otype != ntype || oclass != nclass || refcnt == 0)) { VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); ASSERT(ddt_object_contains(ddt, otype, oclass, ddk) == ENOENT); } /* * Add or update the entry */ if (refcnt != 0) { ddt_histogram_t *ddh = &ddt->ddt_histogram[ntype][nclass]; ddt_histogram_add_entry(ddt, ddh, ddlwe); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); VERIFY0(ddt_object_update(ddt, ntype, nclass, ddlwe, tx)); } } /* Calculate an exponential weighted moving average, lower limited to zero */ static inline int32_t _ewma(int32_t val, int32_t prev, uint32_t weight) { ASSERT3U(val, >=, 0); ASSERT3U(prev, >=, 0); const int32_t new = MAX(0, prev + (val-prev) / (int32_t)MAX(weight, 1)); ASSERT3U(new, >=, 0); return (new); } static inline void ddt_flush_force_update_txg(ddt_t *ddt, uint64_t txg) { /* * If we're not forcing flush, and not being asked to start, then * there's nothing more to do. */ if (txg == 0) { /* Update requested, are we currently forcing flush? */ if (ddt->ddt_flush_force_txg == 0) return; txg = ddt->ddt_flush_force_txg; } /* * If either of the logs have entries unflushed entries before * the wanted txg, set the force txg, otherwise clear it. */ if ((!avl_is_empty(&ddt->ddt_log_active->ddl_tree) && ddt->ddt_log_active->ddl_first_txg <= txg) || (!avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && ddt->ddt_log_flushing->ddl_first_txg <= txg)) { ddt->ddt_flush_force_txg = txg; return; } /* * Nothing to flush behind the given txg, so we can clear force flush * state. */ ddt->ddt_flush_force_txg = 0; } static void ddt_sync_flush_log(ddt_t *ddt, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; ASSERT(avl_is_empty(&ddt->ddt_tree)); /* * Don't do any flushing when the pool is ready to shut down, or in * passes beyond the first. */ if (spa_sync_pass(spa) > 1 || tx->tx_txg > spa_final_dirty_txg(spa)) return; hrtime_t flush_start = gethrtime(); uint32_t count = 0; /* * How many entries we need to flush. We need to at * least match the ingest rate, and also consider the * current backlog of entries. */ uint64_t backlog = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) + avl_numnodes(&ddt->ddt_log_active->ddl_tree); if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) goto housekeeping; uint64_t txgs = MAX(1, zfs_dedup_log_flush_txgs); uint64_t cap = MAX(1, zfs_dedup_log_cap); uint64_t flush_min = MAX(backlog / txgs, zfs_dedup_log_flush_entries_min); /* * The theory for this block is that if we increase the pressure while * we're growing above the cap, and remove it when we're significantly * below the cap, we'll stay near cap while not bouncing around too * much. * * The factor of 10 is to smooth the pressure effect by expressing it * in tenths. The addition of the cap to the backlog in the second * block is to round up, instead of down. We never let the pressure go * below 1 (10 tenths). 
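To make the "tenths" arithmetic concrete, here is a small userland trace of the pressure update with an assumed cap of 1000 entries: the growth branch adds 10*backlog/cap tenths, the decay branch subtracts a rounded-up fraction, and the value never drops below 10 tenths (1.0x).

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t cap = 1000, pressure = 10;     /* tenths: 10 == 1.0x */
        uint64_t backlogs[] = { 1200, 1500, 900, 400 };
        uint64_t prev = 0;

        for (int i = 0; i < 4; i++) {
                uint64_t backlog = backlogs[i];
                if (backlog > cap && backlog > prev) {
                        /* growing above the cap: add 10*backlog/cap tenths */
                        pressure += 10 * backlog / cap;
                } else if (backlog < cap) {
                        /*
                         * Below the cap: subtract, rounding the ratio up,
                         * and never drop below 10 tenths.
                         */
                        uint64_t relax =
                            11 - ((10 * backlog + cap - 1) / cap);
                        pressure = (pressure > relax + 10) ?
                            pressure - relax : 10;
                }
                prev = backlog;
                printf("backlog=%llu -> pressure=%llu tenths\n",
                    (unsigned long long)backlog, (unsigned long long)pressure);
        }
        return (0);
}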
*/ if (cap != UINT_MAX && backlog > cap && backlog > ddt->ddt_log_flush_prev_backlog) { ddt->ddt_log_flush_pressure += 10 * backlog / cap; } else if (cap != UINT_MAX && backlog < cap) { ddt->ddt_log_flush_pressure -= 11 - (((10 * backlog) + cap - 1) / cap); ddt->ddt_log_flush_pressure = MAX(ddt->ddt_log_flush_pressure, 10); } if (zfs_dedup_log_hard_cap && cap != UINT_MAX) flush_min = MAX(flush_min, MIN(backlog - cap, (flush_min * ddt->ddt_log_flush_pressure) / 10)); uint64_t flush_max; /* * If we've been asked to flush everything in a hurry, * try to dump as much as possible on this txg. In * this case we're only limited by time, not amount. * * Otherwise, if we are over the cap, try to get back down to it. * * Finally if there is no cap (or no pressure), just set the max a * little higher than the min to help smooth out variations in flush * times. */ if (ddt->ddt_flush_force_txg > 0) flush_max = avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); else if (cap != UINT32_MAX && !zfs_dedup_log_hard_cap) flush_max = MAX(flush_min * 5 / 4, MIN(backlog - cap, (flush_min * ddt->ddt_log_flush_pressure) / 10)); else flush_max = flush_min * 5 / 4; flush_max = MIN(flush_max, zfs_dedup_log_flush_entries_max); /* * When the pool is busy or someone is explicitly waiting for this txg * to complete, use the zfs_dedup_log_flush_min_time_ms. Otherwise use * half of the time in the txg timeout. */ uint64_t target_time; if (txg_sync_waiting(ddt->ddt_spa->spa_dsl_pool) || vdev_queue_pool_busy(spa)) { target_time = MIN(MSEC2NSEC(zfs_dedup_log_flush_min_time_ms), SEC2NSEC(zfs_txg_timeout) / 2); } else { target_time = SEC2NSEC(zfs_txg_timeout) / 2; } ddt_lightweight_entry_t ddlwe; while (ddt_log_take_first(ddt, ddt->ddt_log_flushing, &ddlwe)) { ddt_sync_flush_entry(ddt, &ddlwe, ddlwe.ddlwe_type, ddlwe.ddlwe_class, tx); /* End if we've synced as much as we needed to. */ if (++count >= flush_max) break; /* * As long as we've flushed the absolute minimum, * stop if we're way over our target time. */ uint64_t diff = gethrtime() - flush_start; if (count > zfs_dedup_log_flush_entries_min && diff >= target_time * 2) break; /* * End if we've passed the minimum flush and we're out of time. */ if (count > flush_min && diff >= target_time) break; } if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)) { /* We emptied it, so truncate on-disk */ DDT_KSTAT_ZERO(ddt, dds_log_flushing_entries); ddt_log_truncate(ddt, tx); } else { /* More to do next time, save checkpoint */ DDT_KSTAT_SUB(ddt, dds_log_flushing_entries, count); ddt_log_checkpoint(ddt, &ddlwe, tx); } ddt_sync_update_stats(ddt, tx); housekeeping: if (avl_is_empty(&ddt->ddt_log_flushing->ddl_tree) && !avl_is_empty(&ddt->ddt_log_active->ddl_tree)) { /* * No more to flush, and the active list has stuff, so * try to swap the logs for next time. */ if (ddt_log_swap(ddt, tx)) { DDT_KSTAT_ZERO(ddt, dds_log_active_entries); DDT_KSTAT_SET(ddt, dds_log_flushing_entries, avl_numnodes(&ddt->ddt_log_flushing->ddl_tree)); } } /* If force flush is no longer necessary, turn it off. */ ddt_flush_force_update_txg(ddt, 0); ddt->ddt_log_flush_prev_backlog = backlog; /* * Update flush rate. This is an exponential weighted moving * average of the number of entries flushed over recent txgs. */ ddt->ddt_log_flush_rate = _ewma(count, ddt->ddt_log_flush_rate, zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_rate, ddt->ddt_log_flush_rate); /* * Update flush time rate. This is an exponential weighted moving * average of the total time taken to flush over recent txgs. 
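The flush, flush-time and ingest rates are all maintained with the _ewma() helper defined earlier. A standalone copy shows how a constant input comes to dominate the average; the weight of 32 is only an assumed value for zfs_dedup_log_flush_flow_rate_txgs. Keeping this integer-only makes the per-txg bookkeeping essentially free.

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

static int32_t
ewma(int32_t val, int32_t prev, uint32_t weight)
{
        /* Same shape as _ewma(): move 1/weight of the way toward val. */
        return (MAX(0, prev + (val - prev) / (int32_t)MAX(weight, 1)));
}

int
main(void)
{
        int32_t rate = 0;

        /* Feed a constant 800 entries per txg with an assumed weight of 32. */
        for (int txg = 1; txg <= 128; txg++)
                rate = ewma(800, rate, 32);
        printf("rate after 128 txgs: %d\n", rate);      /* approaches 800 */
        return (0);
}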
*/ ddt->ddt_log_flush_time_rate = _ewma(ddt->ddt_log_flush_time_rate, (int32_t)NSEC2MSEC(gethrtime() - flush_start), zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_flush_time_rate, ddt->ddt_log_flush_time_rate); if (avl_numnodes(&ddt->ddt_log_flushing->ddl_tree) > 0 && zfs_flags & ZFS_DEBUG_DDT) { zfs_dbgmsg("%lu entries remain(%lu in active), flushed %u @ " "txg %llu, in %llu ms, flush rate %d, time rate %d", (ulong_t)avl_numnodes(&ddt->ddt_log_flushing->ddl_tree), (ulong_t)avl_numnodes(&ddt->ddt_log_active->ddl_tree), count, (u_longlong_t)tx->tx_txg, (u_longlong_t)NSEC2MSEC(gethrtime() - flush_start), ddt->ddt_log_flush_rate, ddt->ddt_log_flush_time_rate); } } static void ddt_sync_table_log(ddt_t *ddt, dmu_tx_t *tx) { uint64_t count = avl_numnodes(&ddt->ddt_tree); if (count > 0) { ddt_log_update_t dlu = {0}; ddt_log_begin(ddt, count, tx, &dlu); ddt_entry_t *dde; void *cookie = NULL; ddt_lightweight_entry_t ddlwe; while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { ASSERT(dde->dde_flags & DDE_FLAG_LOADED); DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_log_entry(ddt, &ddlwe, &dlu); ddt_sync_scan_entry(ddt, &ddlwe, tx); ddt_free(ddt, dde); } ddt_log_commit(ddt, &dlu); DDT_KSTAT_SET(ddt, dds_log_active_entries, avl_numnodes(&ddt->ddt_log_active->ddl_tree)); /* * Sync the stats for the store objects. Even though we haven't * modified anything on those objects, they're no longer the * source of truth for entries that are now in the log, and we * need the on-disk counts to reflect that, otherwise we'll * miscount later when importing. */ for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) ddt_object_sync(ddt, type, class, tx); } } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); ddt->ddt_spa->spa_dedup_dspace = ~0ULL; ddt->ddt_spa->spa_dedup_dsize = ~0ULL; } if (spa_sync_pass(ddt->ddt_spa) == 1) { /* * Update ingest rate. This is an exponential weighted moving * average of the number of entries changed over recent txgs. * The ramp-up cost shouldn't matter too much because the * flusher will be trying to take at least the minimum anyway. 
*/ ddt->ddt_log_ingest_rate = _ewma( count, ddt->ddt_log_ingest_rate, zfs_dedup_log_flush_flow_rate_txgs); DDT_KSTAT_SET(ddt, dds_log_ingest_rate, ddt->ddt_log_ingest_rate); } } static void ddt_sync_table_flush(ddt_t *ddt, dmu_tx_t *tx) { if (avl_numnodes(&ddt->ddt_tree) == 0) return; ddt_entry_t *dde; void *cookie = NULL; while ((dde = avl_destroy_nodes( &ddt->ddt_tree, &cookie)) != NULL) { ASSERT(dde->dde_flags & DDE_FLAG_LOADED); ddt_lightweight_entry_t ddlwe; DDT_ENTRY_TO_LIGHTWEIGHT(ddt, dde, &ddlwe); ddt_sync_flush_entry(ddt, &ddlwe, dde->dde_type, dde->dde_class, tx); ddt_sync_scan_entry(ddt, &ddlwe, tx); ddt_free(ddt, dde); } memcpy(&ddt->ddt_histogram_cache, ddt->ddt_histogram, sizeof (ddt->ddt_histogram)); ddt->ddt_spa->spa_dedup_dspace = ~0ULL; ddt->ddt_spa->spa_dedup_dsize = ~0ULL; ddt_sync_update_stats(ddt, tx); } static void ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; if (ddt->ddt_version == UINT64_MAX) return; if (spa->spa_uberblock.ub_version < SPA_VERSION_DEDUP) { ASSERT0(avl_numnodes(&ddt->ddt_tree)); return; } if (spa->spa_ddt_stat_object == 0) { spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DDT_STATS, tx); } if (ddt->ddt_version == DDT_VERSION_FDT && ddt->ddt_dir_object == 0) ddt_create_dir(ddt, tx); if (ddt->ddt_flags & DDT_FLAG_LOG) ddt_sync_table_log(ddt, tx); else ddt_sync_table_flush(ddt, tx); } void ddt_sync(spa_t *spa, uint64_t txg) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dmu_tx_t *tx; zio_t *rio; ASSERT3U(spa_syncing_txg(spa), ==, txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); rio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); /* * This function may cause an immediate scan of ddt blocks (see * the comment above dsl_scan_ddt() for details). We set the * scan's root zio here so that we can wait for any scan IOs in * addition to the regular ddt IOs. 
*/ ASSERT0P(scn->scn_zio_root); scn->scn_zio_root = rio; for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) continue; ddt_sync_table(ddt, tx); if (ddt->ddt_flags & DDT_FLAG_LOG) ddt_sync_flush_log(ddt, tx); ddt_repair_table(ddt, rio); } (void) zio_wait(rio); scn->scn_zio_root = NULL; dmu_tx_commit(tx); } void ddt_walk_init(spa_t *spa, uint64_t txg) { if (txg == 0) txg = spa_syncing_txg(spa); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) continue; ddt_enter(ddt); ddt_flush_force_update_txg(ddt, txg); ddt_exit(ddt); } } boolean_t ddt_walk_ready(spa_t *spa) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL || !(ddt->ddt_flags & DDT_FLAG_LOG)) continue; if (ddt->ddt_flush_force_txg > 0) return (B_FALSE); } return (B_TRUE); } static int ddt_walk_impl(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe, uint64_t flags, boolean_t wait) { do { do { do { ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; if (ddt == NULL) continue; if (flags != 0 && (ddt->ddt_flags & flags) != flags) continue; if (wait && ddt->ddt_flush_force_txg > 0) return (EAGAIN); int error = ENOENT; if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) { error = ddt_object_walk(ddt, ddb->ddb_type, ddb->ddb_class, &ddb->ddb_cursor, ddlwe); } if (error == 0) return (0); if (error != ENOENT) return (error); ddb->ddb_cursor = 0; } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); ddb->ddb_checksum = 0; } while (++ddb->ddb_type < DDT_TYPES); ddb->ddb_type = 0; } while (++ddb->ddb_class < DDT_CLASSES); return (SET_ERROR(ENOENT)); } int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_lightweight_entry_t *ddlwe) { return (ddt_walk_impl(spa, ddb, ddlwe, 0, B_TRUE)); } /* * This function is used by Block Cloning (brt.c) to increase reference * counter for the DDT entry if the block is already in DDT. * * Return false if the block, despite having the D bit set, is not present * in the DDT. This is possible when the DDT has been pruned by an admin * or by the DDT quota mechanism. */ boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp) { ddt_t *ddt; ddt_entry_t *dde; boolean_t result; spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); ddt = ddt_select(spa, bp); ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); /* Can be NULL if the entry for this block was pruned. */ if (dde == NULL) { ddt_exit(ddt); spa_config_exit(spa, SCL_ZIO, FTAG); return (B_FALSE); } if ((dde->dde_type < DDT_TYPES) || (dde->dde_flags & DDE_FLAG_LOGGED)) { /* * This entry was either synced to a store object (dde_type is * real) or was logged. It must be properly on disk at this * point, so we can just bump its refcount. */ int p = DDT_PHYS_FOR_COPIES(ddt, BP_GET_NDVAS(bp)); ddt_phys_variant_t v = DDT_PHYS_VARIANT(ddt, p); ddt_phys_addref(dde->dde_phys, v); result = B_TRUE; } else { /* * If the block has the DEDUP flag set it still might not * exist in the DEDUP table due to DDT pruning of entries * where refcnt=1. 
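The triple do/while in ddt_walk_impl() above, driven by a persistent ddt_bookmark_t, lets a walk stop at any point and resume later without revisiting entries. A toy userland model with made-up table sizes (entries abstracted to a per-object cursor) shows the same control flow:

#include <stdio.h>

#define N_CLASS         3
#define N_TYPE          1
#define N_CKSUM         4
#define ENTRIES_PER_OBJ 2

struct bookmark {
        int class, type, cksum, cursor;
};

/*
 * Produce one entry per call, advancing the bookmark; return -1 once the
 * whole (class, type, checksum) space is exhausted. Same nesting order as
 * ddt_walk_impl(), with the per-object cursor standing in for
 * ddt_object_walk().
 */
static int
walk_one(struct bookmark *bm)
{
        do {
                do {
                        do {
                                if (bm->cursor < ENTRIES_PER_OBJ) {
                                        printf("class=%d type=%d cksum=%d "
                                            "entry=%d\n", bm->class, bm->type,
                                            bm->cksum, bm->cursor);
                                        bm->cursor++;
                                        return (0);
                                }
                                bm->cursor = 0;
                        } while (++bm->cksum < N_CKSUM);
                        bm->cksum = 0;
                } while (++bm->type < N_TYPE);
                bm->type = 0;
        } while (++bm->class < N_CLASS);
        return (-1);
}

int
main(void)
{
        struct bookmark bm = { 0, 0, 0, 0 };

        while (walk_one(&bm) == 0)
                ;
        return (0);
}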
*/ ddt_remove(ddt, dde); result = B_FALSE; } ddt_exit(ddt); spa_config_exit(spa, SCL_ZIO, FTAG); return (result); } typedef struct ddt_prune_entry { ddt_t *dpe_ddt; ddt_key_t dpe_key; list_node_t dpe_node; ddt_univ_phys_t dpe_phys[]; } ddt_prune_entry_t; typedef struct ddt_prune_info { spa_t *dpi_spa; uint64_t dpi_txg_syncs; uint64_t dpi_pruned; list_t dpi_candidates; } ddt_prune_info_t; /* * Add prune candidates for ddt_sync during spa_sync */ static void prune_candidates_sync(void *arg, dmu_tx_t *tx) { (void) tx; ddt_prune_info_t *dpi = arg; ddt_prune_entry_t *dpe; spa_config_enter(dpi->dpi_spa, SCL_ZIO, FTAG, RW_READER); /* Process the prune candidates collected so far */ while ((dpe = list_remove_head(&dpi->dpi_candidates)) != NULL) { blkptr_t blk; ddt_t *ddt = dpe->dpe_ddt; ddt_enter(ddt); /* * If it's on the live list, then it was loaded for update * this txg and is no longer stale; skip it. */ if (avl_find(&ddt->ddt_tree, &dpe->dpe_key, NULL)) { ddt_exit(ddt); kmem_free(dpe, sizeof (*dpe)); continue; } ddt_bp_create(ddt->ddt_checksum, &dpe->dpe_key, dpe->dpe_phys, DDT_PHYS_FLAT, &blk); ddt_entry_t *dde = ddt_lookup(ddt, &blk, B_TRUE); if (dde != NULL && !(dde->dde_flags & DDE_FLAG_LOGGED)) { ASSERT(dde->dde_flags & DDE_FLAG_LOADED); /* * Zero the physical, so we don't try to free DVAs * at flush nor try to reuse this entry. */ ddt_phys_clear(dde->dde_phys, DDT_PHYS_FLAT); dpi->dpi_pruned++; } ddt_exit(ddt); kmem_free(dpe, sizeof (*dpe)); } spa_config_exit(dpi->dpi_spa, SCL_ZIO, FTAG); dpi->dpi_txg_syncs++; } /* * Prune candidates are collected in open context and processed * in sync context as part of ddt_sync_table(). */ static void ddt_prune_entry(list_t *list, ddt_t *ddt, const ddt_key_t *ddk, const ddt_univ_phys_t *ddp) { ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); size_t dpe_size = sizeof (ddt_prune_entry_t) + DDT_FLAT_PHYS_SIZE; ddt_prune_entry_t *dpe = kmem_alloc(dpe_size, KM_SLEEP); dpe->dpe_ddt = ddt; dpe->dpe_key = *ddk; memcpy(dpe->dpe_phys, ddp, DDT_FLAT_PHYS_SIZE); list_insert_head(list, dpe); } /* * Interate over all the entries in the DDT unique class. * The walk will perform one of the following operations: * (a) build a histogram than can be used when pruning * (b) prune entries older than the cutoff * * Also called by zdb(8) to dump the age histogram */ void ddt_prune_walk(spa_t *spa, uint64_t cutoff, ddt_age_histo_t *histogram) { ddt_bookmark_t ddb = { .ddb_class = DDT_CLASS_UNIQUE, .ddb_type = 0, .ddb_checksum = 0, .ddb_cursor = 0 }; ddt_lightweight_entry_t ddlwe = {0}; int error; int valid = 0; int candidates = 0; uint64_t now = gethrestime_sec(); ddt_prune_info_t dpi; boolean_t pruning = (cutoff != 0); if (pruning) { dpi.dpi_txg_syncs = 0; dpi.dpi_pruned = 0; dpi.dpi_spa = spa; list_create(&dpi.dpi_candidates, sizeof (ddt_prune_entry_t), offsetof(ddt_prune_entry_t, dpe_node)); } if (histogram != NULL) memset(histogram, 0, sizeof (ddt_age_histo_t)); while ((error = ddt_walk_impl(spa, &ddb, &ddlwe, DDT_FLAG_FLAT, B_FALSE)) == 0) { ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; VERIFY(ddt); if (spa_shutting_down(spa) || issig()) break; ASSERT(ddt->ddt_flags & DDT_FLAG_FLAT); ASSERT3U(ddlwe.ddlwe_phys.ddp_flat.ddp_refcnt, <=, 1); uint64_t class_start = ddlwe.ddlwe_phys.ddp_flat.ddp_class_start; /* * If this entry is on the log, then the stored entry is stale * and we should skip it. 
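For the histogram pass of ddt_prune_walk(), each still-unique entry's age is bucketed by the power of two of its age in hours. A standalone sketch of that binning; the HIST_BINS value and the local highbit64() here are stand-ins, not the kernel definitions:

#include <stdint.h>
#include <stdio.h>

#define HIST_BINS       22      /* stand-in; the real value comes from ddt.h */
#define MAX(a, b)       ((a) > (b) ? (a) : (b))
#define MIN(a, b)       ((a) < (b) ? (a) : (b))

/* Local stand-in for the kernel's highbit64(): 1-based highest set bit. */
static int
highbit64(uint64_t v)
{
        int b = 0;

        while (v != 0) {
                b++;
                v >>= 1;
        }
        return (b);
}

int
main(void)
{
        uint64_t now = 1750000000;      /* arbitrary "current" time */
        uint64_t ages_sec[] = { 60, 7200, 86400, 30ULL * 86400 };

        for (int i = 0; i < 4; i++) {
                uint64_t class_start = now - ages_sec[i];
                uint64_t age = MAX(1, (now - class_start) / 3600);
                int bin = MIN(highbit64(age) - 1, HIST_BINS - 1);

                printf("age=%lluh -> bin %d\n",
                    (unsigned long long)age, bin);
        }
        return (0);
}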
*/ if (ddt_log_find_key(ddt, &ddlwe.ddlwe_key, NULL)) continue; /* prune older entries */ if (pruning && class_start < cutoff) { if (candidates++ >= zfs_ddt_prunes_per_txg) { /* sync prune candidates in batches */ VERIFY0(dsl_sync_task(spa_name(spa), NULL, prune_candidates_sync, &dpi, 0, ZFS_SPACE_CHECK_NONE)); candidates = 1; } ddt_prune_entry(&dpi.dpi_candidates, ddt, &ddlwe.ddlwe_key, &ddlwe.ddlwe_phys); } /* build a histogram */ if (histogram != NULL) { uint64_t age = MAX(1, (now - class_start) / 3600); int bin = MIN(highbit64(age) - 1, HIST_BINS - 1); histogram->dah_entries++; histogram->dah_age_histo[bin]++; } valid++; } if (pruning && valid > 0) { if (!list_is_empty(&dpi.dpi_candidates)) { /* sync out final batch of prune candidates */ VERIFY0(dsl_sync_task(spa_name(spa), NULL, prune_candidates_sync, &dpi, 0, ZFS_SPACE_CHECK_NONE)); } list_destroy(&dpi.dpi_candidates); zfs_dbgmsg("pruned %llu entries (%d%%) across %llu txg syncs", (u_longlong_t)dpi.dpi_pruned, (int)((dpi.dpi_pruned * 100) / valid), (u_longlong_t)dpi.dpi_txg_syncs); } } static uint64_t ddt_total_entries(spa_t *spa) { ddt_object_t ddo; ddt_get_dedup_object_stats(spa, &ddo); return (ddo.ddo_count); } int ddt_prune_unique_entries(spa_t *spa, zpool_ddt_prune_unit_t unit, uint64_t amount) { uint64_t cutoff; uint64_t start_time = gethrtime(); if (spa->spa_active_ddt_prune) return (SET_ERROR(EALREADY)); if (ddt_total_entries(spa) == 0) return (0); spa->spa_active_ddt_prune = B_TRUE; zfs_dbgmsg("prune %llu %s", (u_longlong_t)amount, unit == ZPOOL_DDT_PRUNE_PERCENTAGE ? "%" : "seconds old or older"); if (unit == ZPOOL_DDT_PRUNE_PERCENTAGE) { ddt_age_histo_t histogram; uint64_t oldest = 0; /* Make a pass over DDT to build a histogram */ ddt_prune_walk(spa, 0, &histogram); int target = (histogram.dah_entries * amount) / 100; /* * Figure out our cutoff date * (i.e., which bins to prune from) */ for (int i = HIST_BINS - 1; i >= 0 && target > 0; i--) { if (histogram.dah_age_histo[i] != 0) { /* less than this bucket remaining */ if (target < histogram.dah_age_histo[i]) { oldest = MAX(1, (1< 0 && !spa_shutting_down(spa) && !issig()) { /* Traverse DDT to prune entries older that our cuttoff */ ddt_prune_walk(spa, cutoff, NULL); } zfs_dbgmsg("%s: prune completed in %llu ms", spa_name(spa), (u_longlong_t)NSEC2MSEC(gethrtime() - start_time)); spa->spa_active_ddt_prune = B_FALSE; return (0); } ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, "Enable prefetching dedup-ed blks"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_min_time_ms, UINT, ZMOD_RW, "Min time to spend on incremental dedup log flush each transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_min, UINT, ZMOD_RW, "Min number of log entries to flush each transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_entries_max, UINT, ZMOD_RW, "Max number of log entries to flush each transaction"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_txgs, UINT, ZMOD_RW, "Number of TXGs to try to rotate the log in"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_cap, UINT, ZMOD_RW, "Soft cap for the size of the current dedup log"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_hard_cap, UINT, ZMOD_RW, "Whether to use the soft cap as a hard cap"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_flush_flow_rate_txgs, UINT, ZMOD_RW, "Number of txgs to average flow rates across"); diff --git a/sys/contrib/openzfs/module/zfs/ddt_log.c b/sys/contrib/openzfs/module/zfs/ddt_log.c index 3d30e244c1f7..c9217cef4f7d 100644 --- 
a/sys/contrib/openzfs/module/zfs/ddt_log.c +++ b/sys/contrib/openzfs/module/zfs/ddt_log.c @@ -1,779 +1,781 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2023, Klara Inc. */ #include #include #include #include #include #include #include #include #include #include /* * No more than this many txgs before swapping logs. */ uint_t zfs_dedup_log_txg_max = 8; /* * Max memory for the log AVL trees. If zfs_dedup_log_mem_max is zero at module * load, it will be set to zfs_dedup_log_mem_max_percent% of total memory. */ uint64_t zfs_dedup_log_mem_max = 0; uint_t zfs_dedup_log_mem_max_percent = 1; static kmem_cache_t *ddt_log_entry_flat_cache; static kmem_cache_t *ddt_log_entry_trad_cache; #define DDT_LOG_ENTRY_FLAT_SIZE \ (sizeof (ddt_log_entry_t) + DDT_FLAT_PHYS_SIZE) #define DDT_LOG_ENTRY_TRAD_SIZE \ (sizeof (ddt_log_entry_t) + DDT_TRAD_PHYS_SIZE) #define DDT_LOG_ENTRY_SIZE(ddt) \ _DDT_PHYS_SWITCH(ddt, DDT_LOG_ENTRY_FLAT_SIZE, DDT_LOG_ENTRY_TRAD_SIZE) void ddt_log_init(void) { ddt_log_entry_flat_cache = kmem_cache_create("ddt_log_entry_flat_cache", DDT_LOG_ENTRY_FLAT_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); ddt_log_entry_trad_cache = kmem_cache_create("ddt_log_entry_trad_cache", DDT_LOG_ENTRY_TRAD_SIZE, 0, NULL, NULL, NULL, NULL, NULL, 0); /* * Max memory for log AVL entries. At least 1M, because we need * something (that's ~3800 entries per tree). They can say 100% if they * want; it just means they're at the mercy of the the txg flush limit. 
*/ if (zfs_dedup_log_mem_max == 0) { zfs_dedup_log_mem_max_percent = MIN(zfs_dedup_log_mem_max_percent, 100); zfs_dedup_log_mem_max = (physmem * PAGESIZE) * zfs_dedup_log_mem_max_percent / 100; } zfs_dedup_log_mem_max = MAX(zfs_dedup_log_mem_max, 1*1024*1024); } void ddt_log_fini(void) { kmem_cache_destroy(ddt_log_entry_trad_cache); kmem_cache_destroy(ddt_log_entry_flat_cache); } static void ddt_log_name(ddt_t *ddt, char *name, uint_t n) { snprintf(name, DDT_NAMELEN, DMU_POOL_DDT_LOG, zio_checksum_table[ddt->ddt_checksum].ci_name, n); } static void ddt_log_update_header(ddt_t *ddt, ddt_log_t *ddl, dmu_tx_t *tx) { dmu_buf_t *db; VERIFY0(dmu_bonus_hold(ddt->ddt_os, ddl->ddl_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ddt_log_header_t *hdr = (ddt_log_header_t *)db->db_data; DLH_SET_VERSION(hdr, 1); DLH_SET_FLAGS(hdr, ddl->ddl_flags); hdr->dlh_length = ddl->ddl_length; hdr->dlh_first_txg = ddl->ddl_first_txg; hdr->dlh_checkpoint = ddl->ddl_checkpoint; dmu_buf_rele(db, FTAG); } static void ddt_log_create_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) { ASSERT3U(ddt->ddt_dir_object, >, 0); ASSERT0(ddl->ddl_object); char name[DDT_NAMELEN]; ddt_log_name(ddt, name, n); ddl->ddl_object = dmu_object_alloc(ddt->ddt_os, DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, sizeof (ddt_log_header_t), tx); VERIFY0(zap_add(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &ddl->ddl_object, tx)); ddl->ddl_length = 0; ddl->ddl_first_txg = tx->tx_txg; ddt_log_update_header(ddt, ddl, tx); } static void ddt_log_create(ddt_t *ddt, dmu_tx_t *tx) { ddt_log_create_one(ddt, ddt->ddt_log_active, 0, tx); ddt_log_create_one(ddt, ddt->ddt_log_flushing, 1, tx); } static void ddt_log_destroy_one(ddt_t *ddt, ddt_log_t *ddl, uint_t n, dmu_tx_t *tx) { ASSERT3U(ddt->ddt_dir_object, >, 0); if (ddl->ddl_object == 0) return; ASSERT0(ddl->ddl_length); char name[DDT_NAMELEN]; ddt_log_name(ddt, name, n); VERIFY0(zap_remove(ddt->ddt_os, ddt->ddt_dir_object, name, tx)); VERIFY0(dmu_object_free(ddt->ddt_os, ddl->ddl_object, tx)); ddl->ddl_object = 0; } void ddt_log_destroy(ddt_t *ddt, dmu_tx_t *tx) { ddt_log_destroy_one(ddt, ddt->ddt_log_active, 0, tx); ddt_log_destroy_one(ddt, ddt->ddt_log_flushing, 1, tx); } static void ddt_log_update_stats(ddt_t *ddt) { /* * Log object stats. We count the number of live entries in the log * tree, even if there are more than on disk, and even if the same * entry is on both append and flush trees, because that's more what * the user expects to see. This does mean the on-disk size is not * really correlated with the number of entries, but I don't think * that's reasonable to expect anyway. 
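The initialisation above gives the dedup log a memory budget of a percentage of physical memory with a 1 MiB floor; ddt_log_swap() later lets each of the two log trees use half of it. A quick sketch of the same arithmetic for an assumed machine size:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
        uint64_t physmem_bytes = 16ULL << 30;   /* assume a 16 GiB machine */
        uint64_t pct = 1;       /* default zfs_dedup_log_mem_max_percent */

        uint64_t mem_max = MAX(physmem_bytes * pct / 100, 1ULL << 20);
        uint64_t per_tree = mem_max >> 1;       /* half each, per ddt_log_swap() */

        printf("log budget %llu MiB, %llu MiB per tree\n",
            (unsigned long long)(mem_max >> 20),
            (unsigned long long)(per_tree >> 20));
        return (0);
}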
*/ dmu_object_info_t doi; - uint64_t nblocks; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, &doi); - nblocks = doi.doi_physical_blocks_512; - dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, &doi); - nblocks += doi.doi_physical_blocks_512; + uint64_t nblocks = 0; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_active->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; + if (dmu_object_info(ddt->ddt_os, ddt->ddt_log_flushing->ddl_object, + &doi) == 0) + nblocks += doi.doi_physical_blocks_512; ddt_object_t *ddo = &ddt->ddt_log_stats; ddo->ddo_count = avl_numnodes(&ddt->ddt_log_active->ddl_tree) + avl_numnodes(&ddt->ddt_log_flushing->ddl_tree); ddo->ddo_mspace = ddo->ddo_count * DDT_LOG_ENTRY_SIZE(ddt); ddo->ddo_dspace = nblocks << 9; } void ddt_log_begin(ddt_t *ddt, size_t nentries, dmu_tx_t *tx, ddt_log_update_t *dlu) { ASSERT3U(nentries, >, 0); ASSERT0P(dlu->dlu_dbp); if (ddt->ddt_log_active->ddl_object == 0) ddt_log_create(ddt, tx); /* * We want to store as many entries as we can in a block, but never * split an entry across block boundaries. */ size_t reclen = P2ALIGN_TYPED( sizeof (ddt_log_record_t) + sizeof (ddt_log_record_entry_t) + DDT_PHYS_SIZE(ddt), sizeof (uint64_t), size_t); ASSERT3U(reclen, <=, UINT16_MAX); dlu->dlu_reclen = reclen; VERIFY0(dnode_hold(ddt->ddt_os, ddt->ddt_log_active->ddl_object, FTAG, &dlu->dlu_dn)); dnode_set_storage_type(dlu->dlu_dn, DMU_OT_DDT_ZAP); uint64_t nblocks = howmany(nentries, dlu->dlu_dn->dn_datablksz / dlu->dlu_reclen); uint64_t offset = ddt->ddt_log_active->ddl_length; uint64_t length = nblocks * dlu->dlu_dn->dn_datablksz; VERIFY0(dmu_buf_hold_array_by_dnode(dlu->dlu_dn, offset, length, B_FALSE, FTAG, &dlu->dlu_ndbp, &dlu->dlu_dbp, DMU_READ_NO_PREFETCH)); dlu->dlu_tx = tx; dlu->dlu_block = dlu->dlu_offset = 0; } static ddt_log_entry_t * ddt_log_alloc_entry(ddt_t *ddt) { ddt_log_entry_t *ddle; if (ddt->ddt_flags & DDT_FLAG_FLAT) { ddle = kmem_cache_alloc(ddt_log_entry_flat_cache, KM_SLEEP); memset(ddle, 0, DDT_LOG_ENTRY_FLAT_SIZE); } else { ddle = kmem_cache_alloc(ddt_log_entry_trad_cache, KM_SLEEP); memset(ddle, 0, DDT_LOG_ENTRY_TRAD_SIZE); } return (ddle); } static void ddt_log_update_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { /* Create the log tree entry from a live or stored entry */ avl_index_t where; ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, &ddlwe->ddlwe_key, &where); if (ddle == NULL) { ddle = ddt_log_alloc_entry(ddt); ddle->ddle_key = ddlwe->ddlwe_key; avl_insert(&ddl->ddl_tree, ddle, where); } ddle->ddle_type = ddlwe->ddlwe_type; ddle->ddle_class = ddlwe->ddlwe_class; memcpy(ddle->ddle_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); } void ddt_log_entry(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, ddt_log_update_t *dlu) { ASSERT3U(dlu->dlu_dbp, !=, NULL); ddt_log_update_entry(ddt, ddt->ddt_log_active, ddlwe); ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, ddlwe); /* Get our block */ ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); dmu_buf_t *db = dlu->dlu_dbp[dlu->dlu_block]; /* * If this would take us past the end of the block, finish it and * move to the next one. */ if (db->db_size < (dlu->dlu_offset + dlu->dlu_reclen)) { ASSERT3U(dlu->dlu_offset, >, 0); dmu_buf_fill_done(db, dlu->dlu_tx, B_FALSE); dlu->dlu_block++; dlu->dlu_offset = 0; ASSERT3U(dlu->dlu_block, <, dlu->dlu_ndbp); db = dlu->dlu_dbp[dlu->dlu_block]; } /* * If this is the first time touching the block, inform the DMU that * we will fill it, and zero it out. 
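ddt_log_begin() above sizes its dbuf array with simple packing arithmetic: records are fixed length and never straddle a block, so a block holds floor(blocksize / reclen) of them. A sketch with assumed sizes (the real reclen depends on the phys variant):

#include <stdio.h>

#define howmany(x, y)   (((x) + ((y) - 1)) / (y))

int
main(void)
{
        unsigned blocksize = 131072;    /* SPA_OLD_MAXBLOCKSIZE (128 KiB) */
        unsigned reclen = 112;          /* assumed 8-byte-aligned record */
        unsigned nentries = 10000;

        unsigned per_block = blocksize / reclen;        /* records per block */
        unsigned nblocks = howmany(nentries, per_block);

        printf("%u records per block, %u blocks for %u entries\n",
            per_block, nblocks, nentries);
        return (0);
}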
*/ if (dlu->dlu_offset == 0) { dmu_buf_will_fill(db, dlu->dlu_tx, B_FALSE); memset(db->db_data, 0, db->db_size); } /* Create the log record directly in the buffer */ ddt_log_record_t *dlr = (db->db_data + dlu->dlu_offset); DLR_SET_TYPE(dlr, DLR_ENTRY); DLR_SET_RECLEN(dlr, dlu->dlu_reclen); DLR_SET_ENTRY_TYPE(dlr, ddlwe->ddlwe_type); DLR_SET_ENTRY_CLASS(dlr, ddlwe->ddlwe_class); ddt_log_record_entry_t *dlre = (ddt_log_record_entry_t *)&dlr->dlr_payload; dlre->dlre_key = ddlwe->ddlwe_key; memcpy(dlre->dlre_phys, &ddlwe->ddlwe_phys, DDT_PHYS_SIZE(ddt)); /* Advance offset for next record. */ dlu->dlu_offset += dlu->dlu_reclen; } void ddt_log_commit(ddt_t *ddt, ddt_log_update_t *dlu) { ASSERT3U(dlu->dlu_dbp, !=, NULL); ASSERT3U(dlu->dlu_block+1, ==, dlu->dlu_ndbp); ASSERT3U(dlu->dlu_offset, >, 0); /* * Close out the last block. Whatever we haven't used will be zeroed, * which matches DLR_INVALID, so we can detect this during load. */ dmu_buf_fill_done(dlu->dlu_dbp[dlu->dlu_block], dlu->dlu_tx, B_FALSE); dmu_buf_rele_array(dlu->dlu_dbp, dlu->dlu_ndbp, FTAG); ddt->ddt_log_active->ddl_length += dlu->dlu_ndbp * (uint64_t)dlu->dlu_dn->dn_datablksz; dnode_rele(dlu->dlu_dn, FTAG); ddt_log_update_header(ddt, ddt->ddt_log_active, dlu->dlu_tx); memset(dlu, 0, sizeof (ddt_log_update_t)); ddt_log_update_stats(ddt); } boolean_t ddt_log_take_first(ddt_t *ddt, ddt_log_t *ddl, ddt_lightweight_entry_t *ddlwe) { ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); if (ddle == NULL) return (B_FALSE); DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, ddlwe); avl_remove(&ddl->ddl_tree, ddle); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); return (B_TRUE); } boolean_t ddt_log_remove_key(ddt_t *ddt, ddt_log_t *ddl, const ddt_key_t *ddk) { ddt_log_entry_t *ddle = avl_find(&ddl->ddl_tree, ddk, NULL); if (ddle == NULL) return (B_FALSE); ddt_lightweight_entry_t ddlwe; DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); ddt_histogram_sub_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); avl_remove(&ddl->ddl_tree, ddle); kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); return (B_TRUE); } boolean_t ddt_log_find_key(ddt_t *ddt, const ddt_key_t *ddk, ddt_lightweight_entry_t *ddlwe) { ddt_log_entry_t *ddle = avl_find(&ddt->ddt_log_active->ddl_tree, ddk, NULL); if (!ddle) ddle = avl_find(&ddt->ddt_log_flushing->ddl_tree, ddk, NULL); if (!ddle) return (B_FALSE); if (ddlwe) DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, ddlwe); return (B_TRUE); } void ddt_log_checkpoint(ddt_t *ddt, ddt_lightweight_entry_t *ddlwe, dmu_tx_t *tx) { ddt_log_t *ddl = ddt->ddt_log_flushing; ASSERT3U(ddl->ddl_object, !=, 0); #ifdef ZFS_DEBUG /* * There should not be any entries on the log tree before the given * checkpoint. Assert that this is the case. 
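ddt_log_swap() below starts a new flush cycle when any of three conditions holds: the active tree has grown past half the memory budget, it has been filling for more than zfs_dedup_log_txg_max txgs, or a forced flush was requested. A small model of that check; the entry footprint and budget here are assumed values:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
        uint64_t mem_max = 64ULL << 20; /* assumed budget for both trees */
        uint64_t entry_size = 288;      /* assumed in-memory entry footprint */
        uint64_t nentries = 150000;     /* entries on the active tree */
        uint64_t txg = 1040, first_txg = 1030, txg_max = 8, force_txg = 0;

        int too_large = (nentries * entry_size) >= (mem_max >> 1);
        int too_old = txg >= first_txg + MAX(1, txg_max);
        int force = first_txg <= force_txg;     /* 0 means no force request */

        printf("swap=%d (large=%d old=%d force=%d)\n",
            too_large || too_old || force, too_large, too_old, force);
        return (0);
}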
*/ ddt_log_entry_t *ddle = avl_first(&ddl->ddl_tree); if (ddle != NULL) VERIFY3U(ddt_key_compare(&ddle->ddle_key, &ddlwe->ddlwe_key), >, 0); #endif ddl->ddl_flags |= DDL_FLAG_CHECKPOINT; ddl->ddl_checkpoint = ddlwe->ddlwe_key; ddt_log_update_header(ddt, ddl, tx); ddt_log_update_stats(ddt); } void ddt_log_truncate(ddt_t *ddt, dmu_tx_t *tx) { ddt_log_t *ddl = ddt->ddt_log_flushing; if (ddl->ddl_object == 0) return; ASSERT(avl_is_empty(&ddl->ddl_tree)); /* Eject the entire object */ dmu_free_range(ddt->ddt_os, ddl->ddl_object, 0, DMU_OBJECT_END, tx); ddl->ddl_length = 0; ddl->ddl_flags &= ~DDL_FLAG_CHECKPOINT; memset(&ddl->ddl_checkpoint, 0, sizeof (ddt_key_t)); ddt_log_update_header(ddt, ddl, tx); ddt_log_update_stats(ddt); } boolean_t ddt_log_swap(ddt_t *ddt, dmu_tx_t *tx) { /* Swap the logs. The old flushing one must be empty */ VERIFY(avl_is_empty(&ddt->ddt_log_flushing->ddl_tree)); /* * If there are still blocks on the flushing log, truncate it first. * This can happen if there were entries on the flushing log that were * removed in memory via ddt_lookup(); their vestigal remains are * on disk. */ if (ddt->ddt_log_flushing->ddl_length > 0) ddt_log_truncate(ddt, tx); /* * Swap policy. We swap the logs (and so begin flushing) when the * active tree grows too large, or when we haven't swapped it in * some amount of time, or if something has requested the logs be * flushed ASAP (see ddt_walk_init()). */ /* * The log tree is too large if the memory usage of its entries is over * half of the memory limit. This effectively gives each log tree half * the available memory. */ const boolean_t too_large = (avl_numnodes(&ddt->ddt_log_active->ddl_tree) * DDT_LOG_ENTRY_SIZE(ddt)) >= (zfs_dedup_log_mem_max >> 1); const boolean_t too_old = tx->tx_txg >= (ddt->ddt_log_active->ddl_first_txg + MAX(1, zfs_dedup_log_txg_max)); const boolean_t force = ddt->ddt_log_active->ddl_first_txg <= ddt->ddt_flush_force_txg; if (!(too_large || too_old || force)) return (B_FALSE); ddt_log_t *swap = ddt->ddt_log_active; ddt->ddt_log_active = ddt->ddt_log_flushing; ddt->ddt_log_flushing = swap; ASSERT(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING); ddt->ddt_log_active->ddl_flags &= ~(DDL_FLAG_FLUSHING | DDL_FLAG_CHECKPOINT); ASSERT(!(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING)); ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; ddt->ddt_log_active->ddl_first_txg = tx->tx_txg; ddt_log_update_header(ddt, ddt->ddt_log_active, tx); ddt_log_update_header(ddt, ddt->ddt_log_flushing, tx); ddt_log_update_stats(ddt); return (B_TRUE); } static inline void ddt_log_load_entry(ddt_t *ddt, ddt_log_t *ddl, ddt_log_record_t *dlr, const ddt_key_t *checkpoint) { ASSERT3U(DLR_GET_TYPE(dlr), ==, DLR_ENTRY); ddt_log_record_entry_t *dlre = (ddt_log_record_entry_t *)dlr->dlr_payload; if (checkpoint != NULL && ddt_key_compare(&dlre->dlre_key, checkpoint) <= 0) { /* Skip pre-checkpoint entries; they're already flushed. */ return; } ddt_lightweight_entry_t ddlwe; ddlwe.ddlwe_type = DLR_GET_ENTRY_TYPE(dlr); ddlwe.ddlwe_class = DLR_GET_ENTRY_CLASS(dlr); ddlwe.ddlwe_key = dlre->dlre_key; memcpy(&ddlwe.ddlwe_phys, dlre->dlre_phys, DDT_PHYS_SIZE(ddt)); ddt_log_update_entry(ddt, ddl, &ddlwe); } static void ddt_log_empty(ddt_t *ddt, ddt_log_t *ddl) { void *cookie = NULL; ddt_log_entry_t *ddle; IMPLY(ddt->ddt_version == UINT64_MAX, avl_is_empty(&ddl->ddl_tree)); while ((ddle = avl_destroy_nodes(&ddl->ddl_tree, &cookie)) != NULL) { kmem_cache_free(ddt->ddt_flags & DDT_FLAG_FLAT ? 
ddt_log_entry_flat_cache : ddt_log_entry_trad_cache, ddle); } ASSERT(avl_is_empty(&ddl->ddl_tree)); } static int ddt_log_load_one(ddt_t *ddt, uint_t n) { ASSERT3U(n, <, 2); ddt_log_t *ddl = &ddt->ddt_log[n]; char name[DDT_NAMELEN]; ddt_log_name(ddt, name, n); uint64_t obj; int err = zap_lookup(ddt->ddt_os, ddt->ddt_dir_object, name, sizeof (uint64_t), 1, &obj); if (err == ENOENT) return (0); if (err != 0) return (err); dnode_t *dn; err = dnode_hold(ddt->ddt_os, obj, FTAG, &dn); if (err != 0) return (err); ddt_log_header_t hdr; dmu_buf_t *db; err = dmu_bonus_hold_by_dnode(dn, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) { dnode_rele(dn, FTAG); return (err); } memcpy(&hdr, db->db_data, sizeof (ddt_log_header_t)); dmu_buf_rele(db, FTAG); if (DLH_GET_VERSION(&hdr) != 1) { dnode_rele(dn, FTAG); zfs_dbgmsg("ddt_log_load: spa=%s ddt_log=%s " "unknown version=%llu", spa_name(ddt->ddt_spa), name, (u_longlong_t)DLH_GET_VERSION(&hdr)); return (SET_ERROR(EINVAL)); } ddt_key_t *checkpoint = NULL; if (DLH_GET_FLAGS(&hdr) & DDL_FLAG_CHECKPOINT) { /* * If the log has a checkpoint, then we can ignore any entries * that have already been flushed. */ ASSERT(DLH_GET_FLAGS(&hdr) & DDL_FLAG_FLUSHING); checkpoint = &hdr.dlh_checkpoint; } if (hdr.dlh_length > 0) { dmu_prefetch_by_dnode(dn, 0, 0, hdr.dlh_length, ZIO_PRIORITY_SYNC_READ); for (uint64_t offset = 0; offset < hdr.dlh_length; offset += dn->dn_datablksz) { err = dmu_buf_hold_by_dnode(dn, offset, FTAG, &db, DMU_READ_PREFETCH); if (err != 0) { dnode_rele(dn, FTAG); ddt_log_empty(ddt, ddl); return (err); } uint64_t boffset = 0; while (boffset < db->db_size) { ddt_log_record_t *dlr = (ddt_log_record_t *)(db->db_data + boffset); /* Partially-filled block, skip the rest */ if (DLR_GET_TYPE(dlr) == DLR_INVALID) break; switch (DLR_GET_TYPE(dlr)) { case DLR_ENTRY: ddt_log_load_entry(ddt, ddl, dlr, checkpoint); break; default: dmu_buf_rele(db, FTAG); dnode_rele(dn, FTAG); ddt_log_empty(ddt, ddl); return (SET_ERROR(EINVAL)); } boffset += DLR_GET_RECLEN(dlr); } dmu_buf_rele(db, FTAG); } } dnode_rele(dn, FTAG); ddl->ddl_object = obj; ddl->ddl_flags = DLH_GET_FLAGS(&hdr); ddl->ddl_length = hdr.dlh_length; ddl->ddl_first_txg = hdr.dlh_first_txg; if (ddl->ddl_flags & DDL_FLAG_FLUSHING) ddt->ddt_log_flushing = ddl; else ddt->ddt_log_active = ddl; return (0); } int ddt_log_load(ddt_t *ddt) { int err; if (spa_load_state(ddt->ddt_spa) == SPA_LOAD_TRYIMPORT) { /* * The DDT is going to be freed again in a moment, so there's * no point loading the log; it'll just slow down import. */ return (0); } ASSERT0(ddt->ddt_log[0].ddl_object); ASSERT0(ddt->ddt_log[1].ddl_object); if (ddt->ddt_dir_object == 0) { /* * If we're configured but the containing dir doesn't exist * yet, then the log object can't possibly exist either. */ ASSERT3U(ddt->ddt_version, !=, UINT64_MAX); return (SET_ERROR(ENOENT)); } if ((err = ddt_log_load_one(ddt, 0)) != 0) return (err); if ((err = ddt_log_load_one(ddt, 1)) != 0) return (err); VERIFY3P(ddt->ddt_log_active, !=, ddt->ddt_log_flushing); VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_FLUSHING)); VERIFY(!(ddt->ddt_log_active->ddl_flags & DDL_FLAG_CHECKPOINT)); VERIFY(ddt->ddt_log_flushing->ddl_flags & DDL_FLAG_FLUSHING); /* * We have two finalisation tasks: * * - rebuild the histogram. We do this at the end rather than while * we're loading so we don't need to uncount and recount entries that * appear multiple times in the log. * * - remove entries from the flushing tree that are on both trees. 
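When a log object is loaded, each block is scanned record by record: the stored reclen advances the offset, and a zeroed header (DLR_INVALID) marks the unused tail of a partially filled block. A toy version of that scan with a made-up record header:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Toy record header; the real ddt_log_record_t layout differs. */
struct rec {
        uint16_t type;
        uint16_t reclen;
};
#define R_INVALID       0
#define R_ENTRY         1

int
main(void)
{
        uint8_t block[64];
        struct rec a = { R_ENTRY, 24 }, b = { R_ENTRY, 32 };

        memset(block, 0, sizeof (block));       /* zeroed tail == invalid */
        memcpy(block, &a, sizeof (a));
        memcpy(block + 24, &b, sizeof (b));

        for (uint32_t off = 0;
            off + sizeof (struct rec) <= sizeof (block); ) {
                struct rec r;

                memcpy(&r, block + off, sizeof (r));
                if (r.type == R_INVALID)
                        break;          /* partially filled block, stop */
                printf("record at offset %u, length %u\n", off, r.reclen);
                off += r.reclen;
        }
        return (0);
}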
This * happens when ddt_lookup() rehydrates an entry from the flushing * tree, as ddt_log_take_key() removes the entry from the in-memory * tree but doesn't remove it from disk. */ /* * We don't technically need a config lock here, since there shouldn't * be pool config changes during DDT load. dva_get_dsize_sync() via * ddt_stat_generate() is expecting it though, and it won't hurt * anything, so we take it. */ spa_config_enter(ddt->ddt_spa, SCL_STATE, FTAG, RW_READER); avl_tree_t *al = &ddt->ddt_log_active->ddl_tree; avl_tree_t *fl = &ddt->ddt_log_flushing->ddl_tree; ddt_log_entry_t *ae = avl_first(al); ddt_log_entry_t *fe = avl_first(fl); while (ae != NULL || fe != NULL) { ddt_log_entry_t *ddle; if (ae == NULL) { /* active exhausted, take flushing */ ddle = fe; fe = AVL_NEXT(fl, fe); } else if (fe == NULL) { /* flushing exuhausted, take active */ ddle = ae; ae = AVL_NEXT(al, ae); } else { /* compare active and flushing */ int c = ddt_key_compare(&ae->ddle_key, &fe->ddle_key); if (c < 0) { /* active behind, take and advance */ ddle = ae; ae = AVL_NEXT(al, ae); } else if (c > 0) { /* flushing behind, take and advance */ ddle = fe; fe = AVL_NEXT(fl, fe); } else { /* match. remove from flushing, take active */ ddle = fe; fe = AVL_NEXT(fl, fe); avl_remove(fl, ddle); ddle = ae; ae = AVL_NEXT(al, ae); } } ddt_lightweight_entry_t ddlwe; DDT_LOG_ENTRY_TO_LIGHTWEIGHT(ddt, ddle, &ddlwe); ddt_histogram_add_entry(ddt, &ddt->ddt_log_histogram, &ddlwe); } spa_config_exit(ddt->ddt_spa, SCL_STATE, FTAG); ddt_log_update_stats(ddt); return (0); } void ddt_log_alloc(ddt_t *ddt) { ASSERT0P(ddt->ddt_log_active); ASSERT0P(ddt->ddt_log_flushing); avl_create(&ddt->ddt_log[0].ddl_tree, ddt_key_compare, sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); avl_create(&ddt->ddt_log[1].ddl_tree, ddt_key_compare, sizeof (ddt_log_entry_t), offsetof(ddt_log_entry_t, ddle_node)); ddt->ddt_log_active = &ddt->ddt_log[0]; ddt->ddt_log_flushing = &ddt->ddt_log[1]; ddt->ddt_log_flushing->ddl_flags |= DDL_FLAG_FLUSHING; } void ddt_log_free(ddt_t *ddt) { ddt_log_empty(ddt, &ddt->ddt_log[0]); ddt_log_empty(ddt, &ddt->ddt_log[1]); avl_destroy(&ddt->ddt_log[0].ddl_tree); avl_destroy(&ddt->ddt_log[1].ddl_tree); } ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_txg_max, UINT, ZMOD_RW, "Max transactions before starting to flush dedup logs"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max, U64, ZMOD_RD, "Max memory for dedup logs"); ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, log_mem_max_percent, UINT, ZMOD_RD, "Max memory for dedup logs, as % of total memory"); diff --git a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c index 51165d0bf723..3d3a9c713568 100644 --- a/sys/contrib/openzfs/module/zfs/dmu_zfetch.c +++ b/sys/contrib/openzfs/module/zfs/dmu_zfetch.c @@ -1,799 +1,799 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * This tunable disables predictive prefetch. Note that it leaves "prescient" * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, * prescient prefetch never issues i/os that end up not being needed, * so it can't hurt performance. */ static int zfs_prefetch_disable = B_FALSE; /* max # of streams per zfetch */ static unsigned int zfetch_max_streams = 8; /* min time before stream reclaim */ static unsigned int zfetch_min_sec_reap = 1; /* max time before stream delete */ static unsigned int zfetch_max_sec_reap = 2; #ifdef _ILP32 /* min bytes to prefetch per stream (default 2MB) */ static unsigned int zfetch_min_distance = 2 * 1024 * 1024; /* max bytes to prefetch per stream (default 8MB) */ -unsigned int zfetch_max_distance = 8 * 1024 * 1024; +static unsigned int zfetch_max_distance = 8 * 1024 * 1024; #else /* min bytes to prefetch per stream (default 4MB) */ static unsigned int zfetch_min_distance = 4 * 1024 * 1024; /* max bytes to prefetch per stream (default 64MB) */ -unsigned int zfetch_max_distance = 64 * 1024 * 1024; +static unsigned int zfetch_max_distance = 64 * 1024 * 1024; #endif /* max bytes to prefetch indirects for per stream (default 128MB) */ -unsigned int zfetch_max_idistance = 128 * 1024 * 1024; +static unsigned int zfetch_max_idistance = 128 * 1024 * 1024; /* max request reorder distance within a stream (default 16MB) */ -unsigned int zfetch_max_reorder = 16 * 1024 * 1024; +static unsigned int zfetch_max_reorder = 16 * 1024 * 1024; /* Max log2 fraction of holes in a stream */ -unsigned int zfetch_hole_shift = 2; +static unsigned int zfetch_hole_shift = 2; typedef struct zfetch_stats { kstat_named_t zfetchstat_hits; kstat_named_t zfetchstat_future; kstat_named_t zfetchstat_stride; kstat_named_t zfetchstat_past; kstat_named_t zfetchstat_misses; kstat_named_t zfetchstat_max_streams; kstat_named_t zfetchstat_io_issued; kstat_named_t zfetchstat_io_active; } zfetch_stats_t; static zfetch_stats_t zfetch_stats = { { "hits", KSTAT_DATA_UINT64 }, { "future", KSTAT_DATA_UINT64 }, { "stride", KSTAT_DATA_UINT64 }, { "past", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "max_streams", KSTAT_DATA_UINT64 }, { "io_issued", KSTAT_DATA_UINT64 }, { "io_active", KSTAT_DATA_UINT64 }, }; struct { wmsum_t zfetchstat_hits; wmsum_t zfetchstat_future; wmsum_t zfetchstat_stride; wmsum_t zfetchstat_past; wmsum_t zfetchstat_misses; wmsum_t zfetchstat_max_streams; wmsum_t zfetchstat_io_issued; aggsum_t zfetchstat_io_active; } zfetch_sums; #define ZFETCHSTAT_BUMP(stat) \ wmsum_add(&zfetch_sums.stat, 1) #define ZFETCHSTAT_ADD(stat, val) \ wmsum_add(&zfetch_sums.stat, val) static kstat_t *zfetch_ksp; static int zfetch_kstats_update(kstat_t *ksp, int rw) { zfetch_stats_t *zs = ksp->ks_data; if (rw == KSTAT_WRITE) return (EACCES); zs->zfetchstat_hits.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_hits); zs->zfetchstat_future.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_future); zs->zfetchstat_stride.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_stride); zs->zfetchstat_past.value.ui64 = 
wmsum_value(&zfetch_sums.zfetchstat_past); zs->zfetchstat_misses.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_misses); zs->zfetchstat_max_streams.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_max_streams); zs->zfetchstat_io_issued.value.ui64 = wmsum_value(&zfetch_sums.zfetchstat_io_issued); zs->zfetchstat_io_active.value.ui64 = aggsum_value(&zfetch_sums.zfetchstat_io_active); return (0); } void zfetch_init(void) { wmsum_init(&zfetch_sums.zfetchstat_hits, 0); wmsum_init(&zfetch_sums.zfetchstat_future, 0); wmsum_init(&zfetch_sums.zfetchstat_stride, 0); wmsum_init(&zfetch_sums.zfetchstat_past, 0); wmsum_init(&zfetch_sums.zfetchstat_misses, 0); wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (zfetch_ksp != NULL) { zfetch_ksp->ks_data = &zfetch_stats; zfetch_ksp->ks_update = zfetch_kstats_update; kstat_install(zfetch_ksp); } } void zfetch_fini(void) { if (zfetch_ksp != NULL) { kstat_delete(zfetch_ksp); zfetch_ksp = NULL; } wmsum_fini(&zfetch_sums.zfetchstat_hits); wmsum_fini(&zfetch_sums.zfetchstat_future); wmsum_fini(&zfetch_sums.zfetchstat_stride); wmsum_fini(&zfetch_sums.zfetchstat_past); wmsum_fini(&zfetch_sums.zfetchstat_misses); wmsum_fini(&zfetch_sums.zfetchstat_max_streams); wmsum_fini(&zfetch_sums.zfetchstat_io_issued); ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); aggsum_fini(&zfetch_sums.zfetchstat_io_active); } /* * This takes a pointer to a zfetch structure and a dnode. It performs the * necessary setup for the zfetch structure, grokking data from the * associated dnode. */ void dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) { if (zf == NULL) return; zf->zf_dnode = dno; zf->zf_numstreams = 0; list_create(&zf->zf_stream, sizeof (zstream_t), offsetof(zstream_t, zs_node)); mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); } static void dmu_zfetch_stream_fini(zstream_t *zs) { ASSERT(!list_link_active(&zs->zs_node)); zfs_refcount_destroy(&zs->zs_callers); zfs_refcount_destroy(&zs->zs_refs); kmem_free(zs, sizeof (*zs)); } static void dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) { ASSERT(MUTEX_HELD(&zf->zf_lock)); list_remove(&zf->zf_stream, zs); zf->zf_numstreams--; membar_producer(); if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); } /* * Clean-up state associated with a zfetch structure (e.g. destroy the * streams). This doesn't free the zfetch_t itself, that's left to the caller. */ void dmu_zfetch_fini(zfetch_t *zf) { zstream_t *zs; mutex_enter(&zf->zf_lock); while ((zs = list_head(&zf->zf_stream)) != NULL) dmu_zfetch_stream_remove(zf, zs); mutex_exit(&zf->zf_lock); list_destroy(&zf->zf_stream); mutex_destroy(&zf->zf_lock); zf->zf_dnode = NULL; } /* * If there aren't too many active streams already, create one more. * In process delete/reuse all streams without hits for zfetch_max_sec_reap. * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever. * The "blkid" argument is the next block that we expect this stream to access. */ static void dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) { zstream_t *zs, *zs_next, *zs_old = NULL; uint_t now = gethrestime_sec(), t; ASSERT(MUTEX_HELD(&zf->zf_lock)); /* * Delete too old streams, reusing the first found one. 
*/ t = now - zfetch_max_sec_reap; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { zs_next = list_next(&zf->zf_stream, zs); /* * Skip if still active. 1 -- zf_stream reference. */ if ((int)(zs->zs_atime - t) >= 0) continue; if (zfs_refcount_count(&zs->zs_refs) != 1) continue; if (zs_old) dmu_zfetch_stream_remove(zf, zs); else zs_old = zs; } if (zs_old) { zs = zs_old; list_remove(&zf->zf_stream, zs); goto reuse; } /* * The maximum number of streams is normally zfetch_max_streams, * but for small files we lower it such that it's at least possible * for all the streams to be non-overlapping. */ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) / zfetch_max_distance)); if (zf->zf_numstreams >= max_streams) { t = now - zfetch_min_sec_reap; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { if ((int)(zs->zs_atime - t) >= 0) continue; if (zfs_refcount_count(&zs->zs_refs) != 1) continue; if (zs_old == NULL || (int)(zs_old->zs_atime - zs->zs_atime) >= 0) zs_old = zs; } if (zs_old) { zs = zs_old; list_remove(&zf->zf_stream, zs); goto reuse; } ZFETCHSTAT_BUMP(zfetchstat_max_streams); return; } zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); zfs_refcount_create(&zs->zs_callers); zfs_refcount_create(&zs->zs_refs); /* One reference for zf_stream. */ zfs_refcount_add(&zs->zs_refs, NULL); zf->zf_numstreams++; reuse: list_insert_head(&zf->zf_stream, zs); zs->zs_blkid = blkid; /* Allow immediate stream reuse until first hit. */ zs->zs_atime = now - zfetch_min_sec_reap; memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges)); zs->zs_pf_dist = 0; zs->zs_ipf_dist = 0; zs->zs_pf_start = blkid; zs->zs_pf_end = blkid; zs->zs_ipf_start = blkid; zs->zs_ipf_end = blkid; zs->zs_missed = B_FALSE; zs->zs_more = B_FALSE; } static void dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) { zstream_t *zs = arg; if (io_issued && level == 0 && blkid < zs->zs_blkid) zs->zs_more = B_TRUE; if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); } /* * Process stream hit access for nblks blocks starting at zs_blkid. Return * number of blocks to proceed for after aggregation with future ranges. */ static uint64_t dmu_zfetch_hit(zstream_t *zs, uint64_t nblks) { uint_t i, j; /* Optimize sequential accesses (no future ranges). */ if (zs->zs_ranges[0].start == 0) goto done; /* Look for intersections with further ranges. */ for (i = 0; i < ZFETCH_RANGES; i++) { zsrange_t *r = &zs->zs_ranges[i]; if (r->start == 0 || r->start > nblks) break; if (r->end >= nblks) { nblks = r->end; i++; break; } } /* Delete all found intersecting ranges, updates remaining. */ for (j = 0; i < ZFETCH_RANGES; i++, j++) { if (zs->zs_ranges[i].start == 0) break; ASSERT3U(zs->zs_ranges[i].start, >, nblks); ASSERT3U(zs->zs_ranges[i].end, >, nblks); zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks; zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks; } if (j < ZFETCH_RANGES) { zs->zs_ranges[j].start = 0; zs->zs_ranges[j].end = 0; } done: zs->zs_blkid += nblks; return (nblks); } /* * Process future stream access for nblks blocks starting at blkid. Return * number of blocks to proceed for if future ranges reach fill threshold. 
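 * The fill threshold is governed by zfetch_hole_shift: with the default
 * value of 2, the accumulated ranges are only treated as filled while the
 * holes between them make up no more than 1/4 of the span covered so far.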
*/ static uint64_t dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) { ASSERT3U(blkid, >, zs->zs_blkid); blkid -= zs->zs_blkid; ASSERT3U(blkid + nblks, <=, UINT16_MAX); /* Search for first and last intersection or insert point. */ uint_t f = ZFETCH_RANGES, l = 0, i; for (i = 0; i < ZFETCH_RANGES; i++) { zsrange_t *r = &zs->zs_ranges[i]; if (r->start == 0 || r->start > blkid + nblks) break; if (r->end < blkid) continue; if (f > i) f = i; if (l < i) l = i; } if (f <= l) { /* Got some intersecting range, expand it if needed. */ if (zs->zs_ranges[f].start > blkid) zs->zs_ranges[f].start = blkid; zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks); if (f < l) { /* Got more than one intersection, remove others. */ for (f++, l++; l < ZFETCH_RANGES; f++, l++) { zs->zs_ranges[f].start = zs->zs_ranges[l].start; zs->zs_ranges[f].end = zs->zs_ranges[l].end; } zs->zs_ranges[f].start = 0; zs->zs_ranges[f].end = 0; } } else if (i < ZFETCH_RANGES) { /* Got no intersecting ranges, insert new one. */ for (l = ZFETCH_RANGES - 1; l > i; l--) { zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start; zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end; } zs->zs_ranges[i].start = blkid; zs->zs_ranges[i].end = blkid + nblks; } else { /* No space left to insert. Drop the range. */ return (0); } /* Check if with the new access addition we reached fill threshold. */ if (zfetch_hole_shift >= 16) return (0); uint_t hole = 0; for (i = f = l = 0; i < ZFETCH_RANGES; i++) { zsrange_t *r = &zs->zs_ranges[i]; if (r->start == 0) break; hole += r->start - f; f = r->end; if (hole <= r->end >> zfetch_hole_shift) l = r->end; } if (l > 0) return (dmu_zfetch_hit(zs, l)); return (0); } /* * This is the predictive prefetch entry point. dmu_zfetch_prepare() * associates dnode access specified with blkid and nblks arguments with * prefetch stream, predicts further accesses based on that stats and returns * the stream pointer on success. That pointer must later be passed to * dmu_zfetch_run() to initiate the speculative prefetch for the stream and * release it. dmu_zfetch() is a wrapper for simple cases when window between * prediction and prefetch initiation is not needed. * fetch_data argument specifies whether actual data blocks should be fetched: * FALSE -- prefetch only indirect blocks for predicted data blocks; * TRUE -- prefetch predicted data blocks plus following indirect blocks. */ zstream_t * dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, boolean_t have_lock) { zstream_t *zs; spa_t *spa = zf->zf_dnode->dn_objset->os_spa; zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; int64_t ipf_start, ipf_end; if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) return (NULL); if (os_prefetch == ZFS_PREFETCH_METADATA) fetch_data = B_FALSE; /* * If we haven't yet loaded the indirect vdevs' mappings, we * can only read from blocks that we carefully ensure are on * concrete vdevs (or previously-loaded indirect vdevs). So we * can't allow the predictive prefetcher to attempt reads of other * blocks (e.g. of the MOS's dnode object). */ if (!spa_indirect_vdevs_loaded(spa)) return (NULL); /* * As a fast path for small (single-block) files, ignore access * to the first block. */ if (!have_lock && blkid == 0) return (NULL); if (!have_lock) rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); /* * A fast path for small files for which no prefetch will * happen. 
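 * Files with at most two blocks (dn_maxblkid < 2) never get a prefetch
 * stream; the check below bails out before taking zf_lock.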
*/ uint64_t maxblkid = zf->zf_dnode->dn_maxblkid; if (maxblkid < 2) { if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); return (NULL); } mutex_enter(&zf->zf_lock); /* * Find perfect prefetch stream. Depending on whether the accesses * are block-aligned, first block of the new access may either follow * the last block of the previous access, or be equal to it. */ unsigned int dbs = zf->zf_dnode->dn_datablkshift; uint64_t end_blkid = blkid + nblks; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { if (blkid == zs->zs_blkid) { goto hit; } else if (blkid + 1 == zs->zs_blkid) { blkid++; nblks--; goto hit; } } /* * Find close enough prefetch stream. Access crossing stream position * is a hit in its new part. Access ahead of stream position considered * a hit for metadata prefetch, since we do not care about fill percent, * or stored for future otherwise. Access behind stream position is * silently ignored, since we already skipped it reaching fill percent. */ uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX); uint_t t = gethrestime_sec() - zfetch_max_sec_reap; for (zs = list_head(&zf->zf_stream); zs != NULL; zs = list_next(&zf->zf_stream, zs)) { if (blkid > zs->zs_blkid) { if (end_blkid <= zs->zs_blkid + max_reorder) { if (!fetch_data) { nblks = dmu_zfetch_hit(zs, end_blkid - zs->zs_blkid); ZFETCHSTAT_BUMP(zfetchstat_stride); goto future; } nblks = dmu_zfetch_future(zs, blkid, nblks); if (nblks > 0) ZFETCHSTAT_BUMP(zfetchstat_stride); else ZFETCHSTAT_BUMP(zfetchstat_future); goto future; } } else if (end_blkid >= zs->zs_blkid) { nblks -= zs->zs_blkid - blkid; blkid += zs->zs_blkid - blkid; goto hit; } else if (end_blkid + max_reorder > zs->zs_blkid && (int)(zs->zs_atime - t) >= 0) { ZFETCHSTAT_BUMP(zfetchstat_past); zs->zs_atime = gethrestime_sec(); goto out; } } /* * This access is not part of any existing stream. Create a new * stream for it unless we are at the end of file. */ ASSERT0P(zs); if (end_blkid < maxblkid) dmu_zfetch_stream_create(zf, end_blkid); mutex_exit(&zf->zf_lock); ZFETCHSTAT_BUMP(zfetchstat_misses); ipf_start = 0; goto prescient; hit: nblks = dmu_zfetch_hit(zs, nblks); ZFETCHSTAT_BUMP(zfetchstat_hits); future: zs->zs_atime = gethrestime_sec(); /* Exit if we already prefetched for this position before. */ if (nblks == 0) goto out; /* If the file is ending, remove the stream. */ end_blkid = zs->zs_blkid; if (end_blkid >= maxblkid) { dmu_zfetch_stream_remove(zf, zs); out: mutex_exit(&zf->zf_lock); if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); return (NULL); } /* * This access was to a block that we issued a prefetch for on * behalf of this stream. Calculate further prefetch distances. * * Start prefetch from the demand access size (nblks). Double the * distance every access up to zfetch_min_distance. After that only * if needed increase the distance by 1/8 up to zfetch_max_distance. * * Don't double the distance beyond single block if we have more * than ~6% of ARC held by active prefetches. It should help with * getting out of RAM on some badly mispredicted read patterns. 
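 * As a worked example with the 64-bit defaults: a stream of 128KB demand
 * reads starts its data prefetch distance at 128KB, doubles it on each hit
 * (256KB, 512KB, ...) until it reaches zfetch_min_distance (4MB), and
 * afterwards grows it by a further 1/8 per access only while demand keeps
 * catching up with the prefetched region, capped at zfetch_max_distance
 * (64MB).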
*/ unsigned int nbytes = nblks << dbs; unsigned int pf_nblks; if (fetch_data) { if (unlikely(zs->zs_pf_dist < nbytes)) zs->zs_pf_dist = nbytes; else if (zs->zs_pf_dist < zfetch_min_distance && (zs->zs_pf_dist < (1 << dbs) || aggsum_compare(&zfetch_sums.zfetchstat_io_active, arc_c_max >> (4 + dbs)) < 0)) zs->zs_pf_dist *= 2; else if (zs->zs_more) zs->zs_pf_dist += zs->zs_pf_dist / 8; zs->zs_more = B_FALSE; if (zs->zs_pf_dist > zfetch_max_distance) zs->zs_pf_dist = zfetch_max_distance; pf_nblks = zs->zs_pf_dist >> dbs; } else { pf_nblks = 0; } if (zs->zs_pf_start < end_blkid) zs->zs_pf_start = end_blkid; if (zs->zs_pf_end < end_blkid + pf_nblks) zs->zs_pf_end = end_blkid + pf_nblks; /* * Do the same for indirects, starting where we will stop reading * data blocks (and the indirects that point to them). */ if (unlikely(zs->zs_ipf_dist < nbytes)) zs->zs_ipf_dist = nbytes; else zs->zs_ipf_dist *= 2; if (zs->zs_ipf_dist > zfetch_max_idistance) zs->zs_ipf_dist = zfetch_max_idistance; pf_nblks = zs->zs_ipf_dist >> dbs; if (zs->zs_ipf_start < zs->zs_pf_end) zs->zs_ipf_start = zs->zs_pf_end; ipf_start = zs->zs_ipf_end; if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; zfs_refcount_add(&zs->zs_refs, NULL); /* Count concurrent callers. */ zfs_refcount_add(&zs->zs_callers, NULL); mutex_exit(&zf->zf_lock); prescient: /* * Prefetch the following indirect blocks for this access to reduce * dbuf_hold() sync read delays in dmu_buf_hold_array_by_dnode(). * This covers the gap during the first couple accesses when we can * not predict the future yet, but know what is needed right now. * This should be very rare for reads/writes to need more than one * indirect, but more useful for cloning due to much bigger accesses. */ ipf_start = MAX(ipf_start, blkid + 1); int epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; ipf_end = P2ROUNDUP(end_blkid, 1 << epbs) >> epbs; int issued = 0; for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { issued += dbuf_prefetch(zf->zf_dnode, 1, iblk, ZIO_PRIORITY_SYNC_READ, ARC_FLAG_PRESCIENT_PREFETCH); } if (!have_lock) rw_exit(&zf->zf_dnode->dn_struct_rwlock); if (issued) ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); return (zs); } void dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed, boolean_t have_lock, boolean_t uncached) { int64_t pf_start, pf_end, ipf_start, ipf_end; int epbs, issued; if (missed) zs->zs_missed = missed; /* * Postpone the prefetch if there are more concurrent callers. * It happens when multiple requests are waiting for the same * indirect block. The last one will run the prefetch for all. */ if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) { /* Drop reference taken in dmu_zfetch_prepare(). */ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) dmu_zfetch_stream_fini(zs); return; } mutex_enter(&zf->zf_lock); if (zs->zs_missed) { pf_start = zs->zs_pf_start; pf_end = zs->zs_pf_start = zs->zs_pf_end; } else { pf_start = pf_end = 0; } ipf_start = zs->zs_ipf_start; ipf_end = zs->zs_ipf_start = zs->zs_ipf_end; mutex_exit(&zf->zf_lock); ASSERT3S(pf_start, <=, pf_end); ASSERT3S(ipf_start, <=, ipf_end); epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs; ASSERT3S(ipf_start, <=, ipf_end); issued = pf_end - pf_start + ipf_end - ipf_start; if (issued > 1) { /* More references on top of taken in dmu_zfetch_prepare(). 
		 */
		zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
	} else if (issued == 0) {
		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}
	aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	issued = 0;
	for (int64_t blk = pf_start; blk < pf_end; blk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
		    ZIO_PRIORITY_ASYNC_READ, uncached ?
		    ARC_FLAG_UNCACHED : 0, dmu_zfetch_done, zs);
	}
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}

void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock, boolean_t uncached)
{
	zstream_t *zs;

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
		dmu_zfetch_run(zf, zs, missed, have_lock, uncached);
}

ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
	"Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
	"Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
	"Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
	"Max time before stream delete");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
	"Min bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
	"Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
	"Max request reorder distance within a stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
	"Max log2 fraction of holes in a stream");
diff --git a/sys/contrib/openzfs/module/zfs/vdev.c b/sys/contrib/openzfs/module/zfs/vdev.c
index 9cf35e379000..ed04ce0c86eb 100644
--- a/sys/contrib/openzfs/module/zfs/vdev.c
+++ b/sys/contrib/openzfs/module/zfs/vdev.c
@@ -1,6681 +1,6681 @@
// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright 2016 Toomas Soome
 * Copyright 2017 Joyent, Inc.
 * Copyright (c) 2017, Intel Corporation.
 * Copyright (c) 2019, Datto Inc.
All rights reserved. * Copyright (c) 2021, Klara Inc. * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" /* * One metaslab from each (normal-class) vdev is used by the ZIL. These are * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are * part of the spa_embedded_log_class. The metaslab with the most free space * in each vdev is selected for this purpose when the pool is opened (or a * vdev is added). See vdev_metaslab_init(). * * Log blocks can be allocated from the following locations. Each one is tried * in order until the allocation succeeds: * 1. dedicated log vdevs, aka "slog" (spa_log_class) * 2. embedded slog metaslabs (spa_embedded_log_class) * 3. other metaslabs in normal vdevs (spa_normal_class) * * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer * than this number of metaslabs in the vdev. This ensures that we don't set * aside an unreasonable amount of space for the ZIL. If set to less than * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced * (by more than 1<vdev_path != NULL) { zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, vd->vdev_path, buf); } else { zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid, buf); } } void vdev_dbgmsg_print_tree(vdev_t *vd, int indent) { char state[20]; if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { zfs_dbgmsg("%*svdev %llu: %s", indent, "", (u_longlong_t)vd->vdev_id, vd->vdev_ops->vdev_op_type); return; } switch (vd->vdev_state) { case VDEV_STATE_UNKNOWN: (void) snprintf(state, sizeof (state), "unknown"); break; case VDEV_STATE_CLOSED: (void) snprintf(state, sizeof (state), "closed"); break; case VDEV_STATE_OFFLINE: (void) snprintf(state, sizeof (state), "offline"); break; case VDEV_STATE_REMOVED: (void) snprintf(state, sizeof (state), "removed"); break; case VDEV_STATE_CANT_OPEN: (void) snprintf(state, sizeof (state), "can't open"); break; case VDEV_STATE_FAULTED: (void) snprintf(state, sizeof (state), "faulted"); break; case VDEV_STATE_DEGRADED: (void) snprintf(state, sizeof (state), "degraded"); break; case VDEV_STATE_HEALTHY: (void) snprintf(state, sizeof (state), "healthy"); break; default: (void) snprintf(state, sizeof (state), "", (uint_t)vd->vdev_state); } zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, vd->vdev_islog ? " (log)" : "", (u_longlong_t)vd->vdev_guid, vd->vdev_path ? vd->vdev_path : "N/A", state); for (uint64_t i = 0; i < vd->vdev_children; i++) vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); } char * vdev_rt_name(vdev_t *vd, const char *name) { return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}", spa_name(vd->vdev_spa), (u_longlong_t)vd->vdev_guid, name)); } static char * vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type) { return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}", spa_name(vd->vdev_spa), (u_longlong_t)vd->vdev_guid, name, dtl_type)); } /* * Virtual device management. 
*/ static vdev_ops_t *const vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, &vdev_draid_ops, &vdev_draid_spare_ops, &vdev_mirror_ops, &vdev_replacing_ops, &vdev_spare_ops, &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, &vdev_hole_ops, &vdev_indirect_ops, NULL }; /* * Given a vdev type, return the appropriate ops vector. */ static vdev_ops_t * vdev_getops(const char *type) { vdev_ops_t *ops, *const *opspp; for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) if (strcmp(ops->vdev_op_type, type) == 0) break; return (ops); } /* * Given a vdev and a metaslab class, find which metaslab group we're * interested in. All vdevs may belong to two different metaslab classes. * Dedicated slog devices use only the primary metaslab group, rather than a * separate log group. For embedded slogs, vdev_log_mg will be non-NULL and * will point to a metaslab group of either embedded_log_class (for normal * vdevs) or special_embedded_log_class (for special vdevs). */ metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { if ((mc == spa_embedded_log_class(vd->vdev_spa) || mc == spa_special_embedded_log_class(vd->vdev_spa)) && vd->vdev_log_mg != NULL) return (vd->vdev_log_mg); else return (vd->vdev_mg); } void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { (void) vd, (void) remain_rs; physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; } /* * Derive the enumerated allocation bias from string input. * String origin is either the per-vdev zap or zpool(8). */ static vdev_alloc_bias_t vdev_derive_alloc_bias(const char *bias) { vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) alloc_bias = VDEV_BIAS_LOG; else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) alloc_bias = VDEV_BIAS_SPECIAL; else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) alloc_bias = VDEV_BIAS_DEDUP; return (alloc_bias); } uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg) { ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift)); uint64_t csize, psize = asize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg); psize = MIN(psize, csize); } return (psize); } /* * Default asize function: return the MAX of psize with the asize of * all children. This is what's used by anything other than RAID-Z. */ uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); asize = MAX(asize, csize); } return (asize); } uint64_t vdev_default_min_asize(vdev_t *vd) { return (vd->vdev_min_asize); } /* * Get the minimum allocatable size. We define the allocatable size as * the vdev's asize rounded to the nearest metaslab. This allows us to * replace or attach devices which don't have the same physical size but * can still satisfy the same number of allocations. */ uint64_t vdev_get_min_asize(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; /* * If our parent is NULL (inactive spare or cache) or is the root, * just return our own asize. */ if (pvd == NULL) return (vd->vdev_asize); /* * The top-level vdev just returns the allocatable size rounded * to the nearest metaslab. 
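 * For instance, if each metaslab covers 512MB, an asize of 10.3GB is
 * reported as 10GB (20 whole metaslabs); the partial metaslab at the end
 * is not counted as allocatable.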
*/ if (vd == vd->vdev_top) return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, uint64_t)); return (pvd->vdev_ops->vdev_op_min_asize(pvd)); } void vdev_set_min_asize(vdev_t *vd) { vd->vdev_min_asize = vdev_get_min_asize(vd); for (int c = 0; c < vd->vdev_children; c++) vdev_set_min_asize(vd->vdev_child[c]); } /* * Get the minimal allocation size for the top-level vdev. */ uint64_t vdev_get_min_alloc(vdev_t *vd) { uint64_t min_alloc = 1ULL << vd->vdev_ashift; if (vd->vdev_ops->vdev_op_min_alloc != NULL) min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); return (min_alloc); } /* * Get the parity level for a top-level vdev. */ uint64_t vdev_get_nparity(vdev_t *vd) { uint64_t nparity = 0; if (vd->vdev_ops->vdev_op_nparity != NULL) nparity = vd->vdev_ops->vdev_op_nparity(vd); return (nparity); } static int vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; uint64_t objid; int err; if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { return (EINVAL); } err = zap_lookup(mos, objid, vdev_prop_to_name(prop), sizeof (uint64_t), 1, value); if (err == ENOENT) *value = vdev_prop_default_numeric(prop); return (err); } /* * Get the number of data disks for a top-level vdev. */ uint64_t vdev_get_ndisks(vdev_t *vd) { uint64_t ndisks = 1; if (vd->vdev_ops->vdev_op_ndisks != NULL) ndisks = vd->vdev_ops->vdev_op_ndisks(vd); return (ndisks); } vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (vdev < rvd->vdev_children) { ASSERT(rvd->vdev_child[vdev] != NULL); return (rvd->vdev_child[vdev]); } return (NULL); } vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); return (NULL); } static int vdev_count_leaves_impl(vdev_t *vd) { int n = 0; if (vd->vdev_ops->vdev_op_leaf) return (1); for (int c = 0; c < vd->vdev_children; c++) n += vdev_count_leaves_impl(vd->vdev_child[c]); return (n); } int vdev_count_leaves(spa_t *spa) { int rc; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); rc = vdev_count_leaves_impl(spa->spa_root_vdev); spa_config_exit(spa, SCL_VDEV, FTAG); return (rc); } void vdev_add_child(vdev_t *pvd, vdev_t *cvd) { size_t oldsize, newsize; uint64_t id = cvd->vdev_id; vdev_t **newchild; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT0P(cvd->vdev_parent); cvd->vdev_parent = pvd; if (pvd == NULL) return; ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); oldsize = pvd->vdev_children * sizeof (vdev_t *); pvd->vdev_children = MAX(pvd->vdev_children, id + 1); newsize = pvd->vdev_children * sizeof (vdev_t *); newchild = kmem_alloc(newsize, KM_SLEEP); if (pvd->vdev_child != NULL) { memcpy(newchild, pvd->vdev_child, oldsize); kmem_free(pvd->vdev_child, oldsize); } pvd->vdev_child = newchild; pvd->vdev_child[id] = cvd; pvd->vdev_nonrot &= cvd->vdev_nonrot; cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); ASSERT0P(cvd->vdev_top->vdev_parent->vdev_parent); /* * Walk up all ancestors to update guid sum. 
*/ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum += cvd->vdev_guid_sum; if (cvd->vdev_ops->vdev_op_leaf) { list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); cvd->vdev_spa->spa_leaf_list_gen++; } } void vdev_remove_child(vdev_t *pvd, vdev_t *cvd) { int c; uint_t id = cvd->vdev_id; ASSERT(cvd->vdev_parent == pvd); if (pvd == NULL) return; ASSERT(id < pvd->vdev_children); ASSERT(pvd->vdev_child[id] == cvd); pvd->vdev_child[id] = NULL; cvd->vdev_parent = NULL; for (c = 0; c < pvd->vdev_children; c++) if (pvd->vdev_child[c]) break; if (c == pvd->vdev_children) { kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); pvd->vdev_child = NULL; pvd->vdev_children = 0; } if (cvd->vdev_ops->vdev_op_leaf) { spa_t *spa = cvd->vdev_spa; list_remove(&spa->spa_leaf_list, cvd); spa->spa_leaf_list_gen++; } /* * Walk up all ancestors to update guid sum. */ for (; pvd != NULL; pvd = pvd->vdev_parent) pvd->vdev_guid_sum -= cvd->vdev_guid_sum; } /* * Remove any holes in the child array. */ void vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (oldc == 0) return; for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; if (newc > 0) { newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; } } } else { newchild = NULL; } kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); pvd->vdev_child = newchild; pvd->vdev_children = newc; } /* * Allocate and minimally initialize a vdev_t. */ vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; vdev_indirect_config_t *vic; vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); vic = &vd->vdev_indirect_config; if (spa->spa_root_vdev == NULL) { ASSERT(ops == &vdev_root_ops); spa->spa_root_vdev = vd; spa->spa_load_guid = spa_generate_load_guid(); } if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. */ guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } vd->vdev_spa = spa; vd->vdev_id = id; vd->vdev_guid = guid; vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_ishole = (ops == &vdev_hole_ops); vic->vic_prev_indirect_vdev = UINT64_MAX; rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); vd->vdev_obsolete_segments = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments")); /* * Initialize rate limit structs for events. We rate limit ZIO delay * and checksum events so that we don't overwhelm ZED with thousands * of events when a disk is acting up. 
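 * The per-second limits come from the zfs_*_events_per_second tunables
 * passed to zfs_ratelimit_init() below.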
*/ zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_dio_verify_rl, &zfs_dio_write_verify_events_per_second, 1); zfs_ratelimit_init(&vd->vdev_checksum_rl, &zfs_checksum_events_per_second, 1); /* * Default Thresholds for tuning ZED */ vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N); vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T); vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N); vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T); vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); list_link_init(&vd->vdev_leaf_node); list_link_init(&vd->vdev_trim_node); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < DTL_TYPES; t++) { vd->vdev_dtl[t] = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t)); } txg_list_create(&vd->vdev_ms_list, spa, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, spa, offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); return (vd); } /* * Allocate a new vdev. The 'alloctype' is used to control whether we are * creating a new vdev or loading an existing one - the behavior is slightly * different for each case. */ int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) { vdev_ops_t *ops; const char *type; uint64_t guid = 0, islog; vdev_t *vd; vdev_indirect_config_t *vic; const char *tmp = NULL; int rc; vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; boolean_t top_level = (parent && !parent->vdev_parent); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (SET_ERROR(EINVAL)); if ((ops = vdev_getops(type)) == NULL) return (SET_ERROR(EINVAL)); /* * If this is a load, get the vdev guid from the nvlist. * Otherwise, vdev_alloc_common() will generate one for us. 
*/ if (alloctype == VDEV_ALLOC_LOAD) { uint64_t label_id; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || label_id != id) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (SET_ERROR(EINVAL)); } /* * The first allocated vdev must be of type 'root'. */ if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) return (SET_ERROR(EINVAL)); /* * Determine whether we're a log vdev. */ islog = 0; (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (SET_ERROR(ENOTSUP)); if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) return (SET_ERROR(ENOTSUP)); if (top_level && alloctype == VDEV_ALLOC_ADD) { const char *bias; /* * If creating a top-level vdev, check for allocation * classes input. */ if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0) { alloc_bias = vdev_derive_alloc_bias(bias); /* spa_vdev_add() expects feature to be enabled */ if (spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { return (SET_ERROR(ENOTSUP)); } } /* spa_vdev_add() expects feature to be enabled */ if (ops == &vdev_draid_ops && spa->spa_load_state != SPA_LOAD_CREATE && !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { return (SET_ERROR(ENOTSUP)); } } /* * Initialize the vdev specific data. This is done before calling * vdev_alloc_common() since it may fail and this simplifies the * error reporting and cleanup code paths. */ void *tsd = NULL; if (ops->vdev_op_init != NULL) { rc = ops->vdev_op_init(spa, nv, &tsd); if (rc != 0) { return (rc); } } vd = vdev_alloc_common(spa, id, guid, ops); vd->vdev_tsd = tsd; vd->vdev_islog = islog; if (top_level && alloc_bias != VDEV_BIAS_NONE) vd->vdev_alloc_bias = alloc_bias; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) vd->vdev_path = spa_strdup(tmp); /* * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a * fault on a vdev and want it to persist across imports (like with * zpool offline -f). */ rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_faulted = 1; vd->vdev_label_aux = VDEV_AUX_EXTERNAL; } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) vd->vdev_devid = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) vd->vdev_physpath = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &tmp) == 0) vd->vdev_enc_sysfs_path = spa_strdup(tmp); if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) vd->vdev_fru = spa_strdup(tmp); /* * Set the whole_disk property. If it's not specified, leave the value * as -1. 
*/ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &vd->vdev_wholedisk) != 0) vd->vdev_wholedisk = -1ULL; vic = &vd->vdev_indirect_config; ASSERT0(vic->vic_mapping_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, &vic->vic_mapping_object); ASSERT0(vic->vic_births_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, &vic->vic_births_object); ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, &vic->vic_prev_indirect_vdev); /* * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &vd->vdev_not_present); /* * Get the alignment requirement. Ignore pool ashift for vdev * attach case. */ if (alloctype != VDEV_ALLOC_ATTACH) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); } else { vd->vdev_attaching = B_TRUE; } /* * Retrieve the vdev creation time. */ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); if (vd->vdev_ops == &vdev_root_ops && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, &vd->vdev_root_zap); } /* * If we're a top-level vdev, try to load the allocation parameters. */ if (top_level && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, &vd->vdev_ms_shift); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, &vd->vdev_asize); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, &vd->vdev_noalloc); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, &vd->vdev_removing); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, &vd->vdev_top_zap); vd->vdev_rz_expanding = nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); } else { ASSERT0(vd->vdev_top_zap); } if (top_level && alloctype != VDEV_ALLOC_ATTACH) { ASSERT(alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_ADD || alloctype == VDEV_ALLOC_SPLIT || alloctype == VDEV_ALLOC_ROOTPOOL); /* Note: metaslab_group_create() is now deferred */ } if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); } else { ASSERT0(vd->vdev_leaf_zap); } /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, &vd->vdev_dtl_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } if (alloctype == VDEV_ALLOC_ROOTPOOL) { uint64_t spare = 0; if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, &spare) == 0 && spare) spa_spare_add(vd); } (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, &vd->vdev_resilver_txg); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, &vd->vdev_rebuild_txg); if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) vdev_defer_resilver(vd); /* * In general, when importing a pool we want to ignore the * persistent fault state, as the diagnosis made on another * system may not be valid in the current context. 
The only * exception is if we forced a vdev to a persistently faulted * state with 'zpool offline -f'. The persistent fault will * remain across imports until cleared. * * Local vdevs will remain in the faulted state. */ if (spa_load_state(spa) == SPA_LOAD_OPEN || spa_load_state(spa) == SPA_LOAD_IMPORT) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); if (vd->vdev_faulted || vd->vdev_degraded) { const char *aux; vd->vdev_label_aux = VDEV_AUX_ERR_EXCEEDED; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && strcmp(aux, "external") == 0) vd->vdev_label_aux = VDEV_AUX_EXTERNAL; else vd->vdev_faulted = 0ULL; } } } /* * Add ourselves to the parent's list of children. */ vdev_add_child(parent, vd); *vdp = vd; return (0); } void vdev_free(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT0P(vd->vdev_initialize_thread); ASSERT0P(vd->vdev_trim_thread); ASSERT0P(vd->vdev_autotrim_thread); ASSERT0P(vd->vdev_rebuild_thread); /* * Scan queues are normally destroyed at the end of a scan. If the * queue exists here, that implies the vdev is being removed while * the scan is still running. */ if (vd->vdev_scan_io_queue != NULL) { mutex_enter(&vd->vdev_scan_io_queue_lock); dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); vd->vdev_scan_io_queue = NULL; mutex_exit(&vd->vdev_scan_io_queue_lock); } /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. */ vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT0P(vd->vdev_child); ASSERT(vd->vdev_guid_sum == vd->vdev_guid); if (vd->vdev_ops->vdev_op_fini != NULL) vd->vdev_ops->vdev_op_fini(vd); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); ASSERT0(vd->vdev_stat.vs_alloc); /* * Remove this vdev from its parent's child list. */ vdev_remove_child(vd->vdev_parent, vd); ASSERT0P(vd->vdev_parent); ASSERT(!list_link_active(&vd->vdev_leaf_node)); /* * Clean up vdev structure. 
*/ vdev_queue_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); if (vd->vdev_devid) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); if (vd->vdev_enc_sysfs_path) spa_strfree(vd->vdev_enc_sysfs_path); if (vd->vdev_fru) spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); if (vd->vdev_isl2cache) spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); mutex_enter(&vd->vdev_dtl_lock); space_map_close(vd->vdev_dtl_sm); for (int t = 0; t < DTL_TYPES; t++) { zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); zfs_range_tree_destroy(vd->vdev_dtl[t]); } mutex_exit(&vd->vdev_dtl_lock); EQUIV(vd->vdev_indirect_births != NULL, vd->vdev_indirect_mapping != NULL); if (vd->vdev_indirect_births != NULL) { vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vdev_indirect_births_close(vd->vdev_indirect_births); } if (vd->vdev_obsolete_sm != NULL) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; } zfs_range_tree_destroy(vd->vdev_obsolete_segments); rw_destroy(&vd->vdev_indirect_rwlock); mutex_destroy(&vd->vdev_obsolete_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); cv_destroy(&vd->vdev_initialize_cv); mutex_destroy(&vd->vdev_trim_lock); mutex_destroy(&vd->vdev_autotrim_lock); mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); cv_destroy(&vd->vdev_rebuild_cv); zfs_ratelimit_fini(&vd->vdev_delay_rl); zfs_ratelimit_fini(&vd->vdev_deadman_rl); zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); zfs_ratelimit_fini(&vd->vdev_checksum_rl); if (vd == spa->spa_root_vdev) spa->spa_root_vdev = NULL; kmem_free(vd, sizeof (vdev_t)); } /* * Transfer top-level vdev state from svd to tvd. */ static void vdev_top_transfer(vdev_t *svd, vdev_t *tvd) { spa_t *spa = svd->vdev_spa; metaslab_t *msp; vdev_t *vd; int t; ASSERT(tvd == tvd->vdev_top); tvd->vdev_ms_array = svd->vdev_ms_array; tvd->vdev_ms_shift = svd->vdev_ms_shift; tvd->vdev_ms_count = svd->vdev_ms_count; tvd->vdev_top_zap = svd->vdev_top_zap; svd->vdev_ms_array = 0; svd->vdev_ms_shift = 0; svd->vdev_ms_count = 0; svd->vdev_top_zap = 0; if (tvd->vdev_mg) ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); if (tvd->vdev_log_mg) ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); tvd->vdev_mg = svd->vdev_mg; tvd->vdev_log_mg = svd->vdev_log_mg; tvd->vdev_ms = svd->vdev_ms; svd->vdev_mg = NULL; svd->vdev_log_mg = NULL; svd->vdev_ms = NULL; if (tvd->vdev_mg != NULL) tvd->vdev_mg->mg_vd = tvd; if (tvd->vdev_log_mg != NULL) tvd->vdev_log_mg->mg_vd = tvd; tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; svd->vdev_checkpoint_sm = NULL; tvd->vdev_alloc_bias = svd->vdev_alloc_bias; svd->vdev_alloc_bias = VDEV_BIAS_NONE; tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; svd->vdev_stat.vs_alloc = 0; svd->vdev_stat.vs_space = 0; svd->vdev_stat.vs_dspace = 0; /* * State which may be set on a top-level vdev that's in the * process of being removed. 
*/ ASSERT0(tvd->vdev_indirect_config.vic_births_object); ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); ASSERT0P(tvd->vdev_indirect_mapping); ASSERT0P(tvd->vdev_indirect_births); ASSERT0P(tvd->vdev_obsolete_sm); ASSERT0(tvd->vdev_noalloc); ASSERT0(tvd->vdev_removing); ASSERT0(tvd->vdev_rebuilding); tvd->vdev_noalloc = svd->vdev_noalloc; tvd->vdev_removing = svd->vdev_removing; tvd->vdev_rebuilding = svd->vdev_rebuilding; tvd->vdev_rebuild_config = svd->vdev_rebuild_config; tvd->vdev_indirect_config = svd->vdev_indirect_config; tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; tvd->vdev_indirect_births = svd->vdev_indirect_births; zfs_range_tree_swap(&svd->vdev_obsolete_segments, &tvd->vdev_obsolete_segments); tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; svd->vdev_indirect_config.vic_mapping_object = 0; svd->vdev_indirect_config.vic_births_object = 0; svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; svd->vdev_indirect_mapping = NULL; svd->vdev_indirect_births = NULL; svd->vdev_obsolete_sm = NULL; svd->vdev_noalloc = 0; svd->vdev_removing = 0; svd->vdev_rebuilding = 0; for (t = 0; t < TXG_SIZE; t++) { while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_ms_list, msp, t); while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); } if (list_link_active(&svd->vdev_config_dirty_node)) { vdev_config_clean(svd); vdev_config_dirty(tvd); } if (list_link_active(&svd->vdev_state_dirty_node)) { vdev_state_clean(svd); vdev_state_dirty(tvd); } tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; svd->vdev_deflate_ratio = 0; tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { if (vd == NULL) return; vd->vdev_top = tvd; for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } /* * Add a mirror/replacing vdev above an existing vdev. There is no need to * call .vdev_op_init() since mirror/replacing vdevs do not have private state. */ vdev_t * vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) { spa_t *spa = cvd->vdev_spa; vdev_t *pvd = cvd->vdev_parent; vdev_t *mvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_psize = cvd->vdev_psize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; mvd->vdev_nonrot = cvd->vdev_nonrot; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); cvd->vdev_id = mvd->vdev_children; vdev_add_child(mvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (mvd == mvd->vdev_top) vdev_top_transfer(cvd, mvd); return (mvd); } /* * Remove a 1-way mirror/replacing vdev from the tree. 
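 * This is the inverse of vdev_add_parent() above: it is used once a
 * mirror/replacing/spare vdev has collapsed to a single child, e.g. after
 * a resilver completes and the old device is detached.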
*/ void vdev_remove_parent(vdev_t *cvd) { vdev_t *mvd = cvd->vdev_parent; vdev_t *pvd = mvd->vdev_parent; ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(mvd->vdev_children == 1); ASSERT(mvd->vdev_ops == &vdev_mirror_ops || mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ if (mvd->vdev_top == mvd) { uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; cvd->vdev_orig_guid = cvd->vdev_guid; cvd->vdev_guid += guid_delta; cvd->vdev_guid_sum += guid_delta; /* * If pool not set for autoexpand, we need to also preserve * mvd's asize to prevent automatic expansion of cvd. * Otherwise if we are adjusting the mirror by attaching and * detaching children of non-uniform sizes, the mirror could * autoexpand, unexpectedly requiring larger devices to * re-establish the mirror. */ if (!cvd->vdev_spa->spa_autoexpand) cvd->vdev_asize = mvd->vdev_asize; } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); if (cvd == cvd->vdev_top) vdev_top_transfer(mvd, cvd); ASSERT0(mvd->vdev_children); vdev_free(mvd); } /* * Choose GCD for spa_gcd_alloc. */ static uint64_t vdev_gcd(uint64_t a, uint64_t b) { while (b != 0) { uint64_t t = b; b = a % b; a = t; } return (a); } /* * Set spa_min_alloc and spa_gcd_alloc. */ static void vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) { if (min_alloc < spa->spa_min_alloc) spa->spa_min_alloc = min_alloc; if (spa->spa_gcd_alloc == INT_MAX) { spa->spa_gcd_alloc = min_alloc; } else { spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); } } void vdev_metaslab_group_create(vdev_t *vd) { spa_t *spa = vd->vdev_spa; /* * metaslab_group_create was delayed until allocation bias was available */ if (vd->vdev_mg == NULL) { metaslab_class_t *mc; if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) vd->vdev_alloc_bias = VDEV_BIAS_LOG; ASSERT3U(vd->vdev_islog, ==, (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); switch (vd->vdev_alloc_bias) { case VDEV_BIAS_LOG: mc = spa_log_class(spa); break; case VDEV_BIAS_SPECIAL: mc = spa_special_class(spa); break; case VDEV_BIAS_DEDUP: mc = spa_dedup_class(spa); break; default: mc = spa_normal_class(spa); } vd->vdev_mg = metaslab_group_create(mc, vd); if (!vd->vdev_islog) { if (mc == spa_special_class(spa)) { vd->vdev_log_mg = metaslab_group_create( spa_special_embedded_log_class(spa), vd); } else { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd); } } /* * The spa ashift min/max only apply for the normal metaslab * class. Class destination is late binding so ashift boundary * setting had to wait until now. 
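 * In other words, special, dedup and log vdevs do not move
 * spa_min_ashift/spa_max_ashift; only normal-class top-level vdevs do,
 * as the check below enforces.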
*/ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { if (vd->vdev_ashift > spa->spa_max_ashift) spa->spa_max_ashift = vd->vdev_ashift; if (vd->vdev_ashift < spa->spa_min_ashift) spa->spa_min_ashift = vd->vdev_ashift; uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } } } void vdev_update_nonallocating_space(vdev_t *vd, boolean_t add) { spa_t *spa = vd->vdev_spa; if (vd->vdev_mg->mg_class != spa_normal_class(spa)) return; uint64_t raw_space = metaslab_group_get_space(vd->vdev_mg); uint64_t dspace = spa_deflate(spa) ? vdev_deflated_space(vd, raw_space) : raw_space; if (add) { spa->spa_nonallocating_dspace += dspace; } else { ASSERT3U(spa->spa_nonallocating_dspace, >=, dspace); spa->spa_nonallocating_dspace -= dspace; } } int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; boolean_t expanding = (oldc != 0); ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); /* * This vdev is not being allocated from yet or is a hole. */ if (vd->vdev_ms_shift == 0) return (0); ASSERT(!vd->vdev_ishole); ASSERT(oldc <= newc); mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); if (expanding) { memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); } vd->vdev_ms = mspp; vd->vdev_ms_count = newc; /* * Weighting algorithms can depend on the number of metaslabs in the * vdev. In order to ensure that all weights are correct at all times, * we need to recalculate here. */ for (uint64_t m = 0; m < oldc; m++) { metaslab_t *msp = vd->vdev_ms[m]; mutex_enter(&msp->ms_lock); metaslab_recalculate_weight_and_sort(msp); mutex_exit(&msp->ms_lock); } for (uint64_t m = oldc; m < newc; m++) { uint64_t object = 0; /* * vdev_ms_array may be 0 if we are creating the "fake" * metaslabs for an indirect vdev for zdb's leak detection. * See zdb_leak_init(). */ if (txg == 0 && vd->vdev_ms_array != 0) { error = dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, m * sizeof (uint64_t), sizeof (uint64_t), &object, DMU_READ_PREFETCH); if (error != 0) { vdev_dbgmsg(vd, "unable to read the metaslab " "array [error=%d]", error); return (error); } } error = metaslab_init(vd->vdev_mg, m, object, txg, &(vd->vdev_ms[m])); if (error != 0) { vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", error); return (error); } } /* * Find the emptiest metaslab on the vdev and mark it for use for * embedded slog by moving it from the regular to the log metaslab * group. This works for normal and special vdevs. */ if ((vd->vdev_mg->mg_class == spa_normal_class(spa) || vd->vdev_mg->mg_class == spa_special_class(spa)) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; uint64_t smallest = UINT64_MAX; /* * Note, we only search the new metaslabs, because the old * (pre-existing) ones may be active (e.g. have non-empty * range_tree's), and we don't move them to the new * metaslab_t. */ for (uint64_t m = oldc; m < newc; m++) { uint64_t alloc = space_map_allocated(vd->vdev_ms[m]->ms_sm); if (alloc < smallest) { slog_msid = m; smallest = alloc; } } metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; /* * The metaslab was marked as dirty at the end of * metaslab_init(). Remove it from the dirty list so that we * can uninitialize and reinitialize it to the new class. 
*/ if (txg != 0) { (void) txg_list_remove_this(&vd->vdev_ms_list, slog_ms, txg); } uint64_t sm_obj = space_map_object(slog_ms->ms_sm); metaslab_fini(slog_ms); VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, &vd->vdev_ms[slog_msid])); } if (txg == 0) spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); /* * If the vdev is marked as non-allocating then don't * activate the metaslabs since we want to ensure that * no allocations are performed on this device. */ if (vd->vdev_noalloc) { /* track non-allocating vdev space */ vdev_update_nonallocating_space(vd, B_TRUE); } else if (!expanding) { metaslab_group_activate(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); } if (txg == 0) spa_config_exit(spa, SCL_ALLOC, FTAG); return (0); } void vdev_metaslab_fini(vdev_t *vd) { if (vd->vdev_checkpoint_sm != NULL) { ASSERT(spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_POOL_CHECKPOINT)); space_map_close(vd->vdev_checkpoint_sm); /* * Even though we close the space map, we need to set its * pointer to NULL. The reason is that vdev_metaslab_fini() * may be called multiple times for certain operations * (i.e. when destroying a pool) so we need to ensure that * this clause never executes twice. This logic is similar * to the one used for the vdev_ms clause below. */ vd->vdev_checkpoint_sm = NULL; } if (vd->vdev_ms != NULL) { metaslab_group_t *mg = vd->vdev_mg; metaslab_group_passivate(mg); if (vd->vdev_log_mg != NULL) { ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); } uint64_t count = vd->vdev_ms_count; for (uint64_t m = 0; m < count; m++) { metaslab_t *msp = vd->vdev_ms[m]; if (msp != NULL) metaslab_fini(msp); } vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); vd->vdev_ms = NULL; vd->vdev_ms_count = 0; for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { ASSERT0(mg->mg_histogram[i]); if (vd->vdev_log_mg != NULL) ASSERT0(vd->vdev_log_mg->mg_histogram[i]); } } ASSERT0(vd->vdev_ms_count); } typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; boolean_t vps_zio_done_probe; int vps_flags; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { spa_t *spa = zio->io_spa; vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { if (zio->io_error == 0) vps->vps_readable = 1; if (zio->io_error == 0 && spa_writeable(spa)) { zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_abd, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); } else { abd_free(zio->io_abd); } } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_error == 0) vps->vps_writeable = 1; abd_free(zio->io_abd); } else if (zio->io_type == ZIO_TYPE_NULL) { zio_t *pio; zio_link_t *zl; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", vd->vdev_cant_read, vd->vdev_cant_write); if (vdev_readable(vd) && (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, spa, vd, NULL, NULL, 0); zio->io_error = SET_ERROR(ENXIO); /* * If this probe was initiated from zio pipeline, then * change the state in a spa_async_request. Probes that * were initiated from a vdev_open can change the state * as part of the open call. 
* Skip fault injection if this vdev is already removed * or a removal is pending. */ if (vps->vps_zio_done_probe && !vd->vdev_remove_wanted && !vd->vdev_removed) { vd->vdev_fault_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); } } mutex_enter(&vd->vdev_probe_lock); ASSERT(vd->vdev_probe_zio == zio); vd->vdev_probe_zio = NULL; mutex_exit(&vd->vdev_probe_lock); zl = NULL; while ((pio = zio_walk_parents(zio, &zl)) != NULL) if (!vdev_accessible(vd, pio)) pio->io_error = SET_ERROR(ENXIO); kmem_free(vps, sizeof (*vps)); } } /* * Determine whether this device is accessible. * * Read and write to several known locations: the pad regions of each * vdev label but the first, which we leave alone in case it contains * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; vdev_probe_stats_t *vps = NULL; zio_t *pio; ASSERT(vd->vdev_ops->vdev_op_leaf); /* * Don't probe the probe. */ if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) return (NULL); /* * To prevent 'probe storms' when a device fails, we create * just one probe i/o at a time. All zios that want to probe * this vdev will become parents of the probe io. */ mutex_enter(&vd->vdev_probe_lock); if ((pio = vd->vdev_probe_zio) == NULL) { vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; vps->vps_zio_done_probe = (zio != NULL); if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* * vdev_cant_read and vdev_cant_write can only * transition from TRUE to FALSE when we have the * SCL_ZIO lock as writer; otherwise they can only * transition from FALSE to TRUE. This ensures that * any zio looking at these values can assume that * failures persist for the life of the I/O. That's * important because when a device has intermittent * connectivity problems, we want to ensure that * they're ascribed to the device (ENXIO) and not * the zio (EIO). * * Since we hold SCL_ZIO as writer here, clear both * values so the probe can reevaluate from first * principles. */ vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; } vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, vdev_probe_done, vps, vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); } if (zio != NULL) zio_add_child(zio, pio); mutex_exit(&vd->vdev_probe_lock); if (vps == NULL) { ASSERT(zio != NULL); return (NULL); } for (int l = 1; l < VDEV_LABELS; l++) { zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } if (zio == NULL) return (pio); zio_nowait(pio); return (NULL); } static void vdev_load_child(void *arg) { vdev_t *vd = arg; vd->vdev_load_error = vdev_load(vd); } static void vdev_open_child(void *arg) { vdev_t *vd = arg; vd->vdev_open_thread = curthread; vd->vdev_open_error = vdev_open(vd); vd->vdev_open_thread = NULL; } static boolean_t vdev_uses_zvols(vdev_t *vd) { #ifdef _KERNEL if (zvol_is_zvol(vd->vdev_path)) return (B_TRUE); #endif for (int c = 0; c < vd->vdev_children; c++) if (vdev_uses_zvols(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Returns B_TRUE if the passed child should be opened. */ static boolean_t vdev_default_open_children_func(vdev_t *vd) { (void) vd; return (B_TRUE); } /* * Open the requested child vdevs. If any of the leaf vdevs are using * a ZFS volume then do the opens in a single thread. 
This avoids a * deadlock when the current thread is holding the spa_namespace_lock. */ static void vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) { int children = vd->vdev_children; taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, children, children, TASKQ_PREPOPULATE); vd->vdev_nonrot = B_TRUE; for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE) continue; if (tq == NULL || vdev_uses_zvols(vd)) { cvd->vdev_open_error = vdev_open(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_open_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) taskq_wait(tq); for (int c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (open_func(cvd) == B_FALSE || cvd->vdev_state <= VDEV_STATE_FAULTED) continue; vd->vdev_nonrot &= cvd->vdev_nonrot; } if (tq != NULL) taskq_destroy(tq); } /* * Open all child vdevs. */ void vdev_open_children(vdev_t *vd) { vdev_open_children_impl(vd, vdev_default_open_children_func); } /* * Conditionally open a subset of child vdevs. */ void vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) { vdev_open_children_impl(vd, open_func); } /* * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE * changed, this algorithm can not change, otherwise it would inconsistently * account for existing bp's. We also hard-code txg 0 for the same reason * since expanded RAIDZ vdevs can use a different asize for different birth * txg's. */ static void vdev_set_deflate_ratio(vdev_t *vd) { if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> SPA_MINBLOCKSHIFT); } } /* * Choose the best of two ashifts, preferring one between logical ashift * (absolute minimum) and administrator defined maximum, otherwise take * the biggest of the two. */ uint64_t vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) { if (a > logical && a <= zfs_vdev_max_auto_ashift) { if (b <= logical || b > zfs_vdev_max_auto_ashift) return (a); else return (MAX(a, b)); } else if (b <= logical || b > zfs_vdev_max_auto_ashift) return (MAX(a, b)); return (b); } /* * Maximize performance by inflating the configured ashift for top level * vdevs to be as close to the physical ashift as possible while maintaining * administrator defined limits and ensuring it doesn't go below the * logical ashift. */ static void vdev_ashift_optimize(vdev_t *vd) { ASSERT(vd == vd->vdev_top); if (vd->vdev_ashift < vd->vdev_physical_ashift && vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { vd->vdev_ashift = MIN( MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), MAX(zfs_vdev_min_auto_ashift, vd->vdev_physical_ashift)); } else { /* * If the logical and physical ashifts are the same, then * we ensure that the top-level vdev's ashift is not smaller * than our minimum ashift value. For the unusual case * where logical ashift > physical ashift, we can't cap * the calculated ashift based on max ashift as that * would cause failures. * We still check if we need to increase it to match * the min ashift. */ vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, vd->vdev_ashift); } } /* * Prepare a virtual device for access. 
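 *
 * In outline, the open path below proceeds roughly as follows (an
 * informal sketch of the code, not a contract):
 *
 *	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
 *	    &logical_ashift, &physical_ashift);
 *	- derive psize, asize and max_asize from osize, reserving label space
 *	- choose or validate vdev_ashift (see vdev_ashift_optimize() above)
 *	- zio_wait(vdev_probe(vd, NULL)) to confirm the device accepts I/O
 *
 * Any failure sets a matching VDEV_STATE_* and VDEV_AUX_* pair through
 * vdev_set_state() and returns an errno.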
*/ int vdev_open(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int error; uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; uint64_t logical_ashift = 0; uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_fault_wanted = B_FALSE; vd->vdev_remove_wanted = B_FALSE; vd->vdev_min_asize = vdev_get_min_asize(vd); /* * If this vdev is not removed, check its fault status. If it's * faulted, bail out of the open. */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT0(vd->vdev_children); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } else if (vd->vdev_offline) { ASSERT0(vd->vdev_children); vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (SET_ERROR(ENXIO)); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &logical_ashift, &physical_ashift); /* Keep the device in removed state if unplugged */ if (error == ENOENT && vd->vdev_removed) { vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); return (error); } /* * Physical volume size should never be larger than its max size, unless * the disk has shrunk while we were reading it or the device is buggy * or damaged: either way it's not safe for use, bail out of the open. */ if (osize > max_osize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_OPEN_FAILED); return (SET_ERROR(ENXIO)); } /* * Reset the vdev_reopening flag so that we actually close * the vdev on error. */ vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); if (error) { if (vd->vdev_removed && vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) vd->vdev_removed = B_FALSE; if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, vd->vdev_stat.vs_aux); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); } return (error); } vd->vdev_removed = B_FALSE; /* * Recheck the faulted flag now that we have confirmed that * the vdev is accessible. If we're faulted, bail. */ if (vd->vdev_faulted) { ASSERT0(vd->vdev_children); ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, vd->vdev_label_aux); return (SET_ERROR(ENXIO)); } if (vd->vdev_degraded) { ASSERT0(vd->vdev_children); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } /* * For hole or missing vdevs we just return success. 
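 * Holes are config placeholders (e.g. the slot left behind when a
 * top-level log device is removed) and missing vdevs have no backing
 * storage, so none of the size, ashift or probe handling below applies
 * to them.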
*/ if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) return (0); for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } } osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = osize; asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); max_asize = max_osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); } else { if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_TOO_SMALL); return (SET_ERROR(EOVERFLOW)); } psize = 0; asize = osize; max_asize = max_osize; } /* * If the vdev was expanded, record this so that we can re-create the * uberblock rings in labels {2,3}, during the next sync. */ if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) vd->vdev_copy_uberblocks = B_TRUE; vd->vdev_psize = psize; /* * Make sure the allocatable size hasn't shrunk too much. */ if (asize < vd->vdev_min_asize) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EINVAL)); } /* * We can always set the logical/physical ashift members since * their values are only used to calculate the vdev_ashift when * the device is first added to the config. These values should * not be used for anything else since they may change whenever * the device is reopened and we don't store them in the label. */ vd->vdev_physical_ashift = MAX(physical_ashift, vd->vdev_physical_ashift); vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. * For compatibility, a different ashift can be requested. */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; /* * If the vdev_ashift was not overridden at creation time * (0) or the override value is impossible for the device, * then set it the logical ashift and optimize the ashift. */ if (vd->vdev_ashift < vd->vdev_logical_ashift) { vd->vdev_ashift = vd->vdev_logical_ashift; if (vd->vdev_logical_ashift > ASHIFT_MAX) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_ASHIFT_TOO_BIG); return (SET_ERROR(EDOM)); } if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) vdev_ashift_optimize(vd); vd->vdev_attaching = B_FALSE; } if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || vd->vdev_ashift > ASHIFT_MAX)) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_ASHIFT); return (SET_ERROR(EDOM)); } } else { /* * Make sure the alignment required hasn't increased. */ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { (void) zfs_ereport_post( FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, spa, vd, NULL, NULL, 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); return (SET_ERROR(EDOM)); } vd->vdev_max_asize = max_asize; } /* * If all children are healthy we update asize if either: * The asize has increased, due to a device expansion caused by dynamic * LUN growth or vdev replacement, and automatic expansion is enabled; * making the additional space available. * * The asize has decreased, due to a device shrink usually caused by a * vdev replace with a smaller device. 
This ensures that calculations * based of max_asize and asize e.g. esize are always valid. It's safe * to do this as we've already validated that asize is greater than * vdev_min_asize. */ if (vd->vdev_state == VDEV_STATE_HEALTHY && ((asize > vd->vdev_asize && (vd->vdev_expanding || spa->spa_autoexpand)) || (asize < vd->vdev_asize))) vd->vdev_asize = asize; vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the * vdev open for business. */ if (vd->vdev_ops->vdev_op_leaf && (error = zio_wait(vdev_probe(vd, NULL))) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); return (error); } /* * Track the minimum allocation size. */ if (vd->vdev_top == vd && vd->vdev_ashift != 0 && vd->vdev_islog == 0 && vd->vdev_aux == NULL) { uint64_t min_alloc = vdev_get_min_alloc(vd); vdev_spa_set_alloc(spa, min_alloc); } /* * If this is a leaf vdev, assess whether a resilver is needed. * But don't do this if we are doing a reopen for a scrub, since * this would just restart the scrub we are already doing. */ if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); return (0); } static void vdev_validate_child(void *arg) { vdev_t *vd = arg; vd->vdev_validate_thread = curthread; vd->vdev_validate_error = vdev_validate(vd); vd->vdev_validate_thread = NULL; } /* * Called once the vdevs are all opened, this routine validates the label * contents. This needs to be done before vdev_load() so that we don't * inadvertently do repair I/Os to the wrong device. * * This function will only return failure if one of the vdevs indicates that it * has since been destroyed or exported. This is only possible if * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state * will be updated but the function will return 0. */ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; taskq_t *tq = NULL; nvlist_t *label; uint64_t guid = 0, aux_guid = 0, top_guid; uint64_t state; nvlist_t *nvl; uint64_t txg; int children = vd->vdev_children; if (vdev_validate_skip) return (0); if (children > 0) { tq = taskq_create("vdev_validate", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } for (uint64_t c = 0; c < children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { vdev_validate_child(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < children; c++) { int error = vd->vdev_child[c]->vdev_validate_error; if (error != 0) return (SET_ERROR(EBADF)); } /* * If the device has already failed, or was marked offline, don't do * any further validation. Otherwise, label I/O will fail and we will * overwrite the previous state. */ if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) return (0); /* * If we are performing an extreme rewind, we allow for a label that * was modified at a point after the current txg. * If config lock is not held do not check for the txg. spa_sync could * be updating the vdev's label before updating spa_last_synced_txg. 
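 *
 * Informally, the selection just below is:
 *
 *	extreme rewind, nothing synced yet, or SCL_CONFIG not held as writer
 *		-> txg = UINT64_MAX, i.e. accept the best label found
 *	otherwise
 *		-> only accept a label written at or before spa_last_synced_txg()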
*/ if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) txg = UINT64_MAX; else txg = spa_last_synced_txg(spa); if ((label = vdev_label_read_config(vd, txg)) == NULL) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_BAD_LABEL); vdev_dbgmsg(vd, "vdev_validate: failed reading config for " "txg %llu", (u_longlong_t)txg); return (0); } /* * Determine if this vdev has been split off into another * pool. If so, then refuse to open it. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, &aux_guid) == 0 && aux_guid == spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_SPLIT_POOL); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_GUID); return (0); } /* * If config is not trusted then ignore the spa guid check. This is * necessary because if the machine crashed during a re-guid the new * guid might have been written to all of the vdev labels, but not the * cached config. The check will be performed again once we have the * trusted config from the MOS. */ if (spa->spa_trust_config && guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " "match config (%llu != %llu)", (u_longlong_t)guid, (u_longlong_t)spa_guid(spa)); return (0); } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, &aux_guid) != 0) aux_guid = 0; if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_GUID); return (0); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_TOP_GUID); return (0); } /* * If this vdev just became a top-level vdev because its sibling was * detached, it will have adopted the parent's vdev guid -- but the * label may or may not be on disk yet. Fortunately, either version * of the label will have the same top guid, so if we're a top-level * vdev, we can safely compare to that instead. * However, if the config comes from a cachefile that failed to update * after the detach, a top-level vdev will appear as a non top-level * vdev in the config. Also relax the constraints if we perform an * extreme rewind. * * If we split this vdev off instead, then we also check the * original pool's guid. We don't want to consider the vdev * corrupt if it is partway through a split operation. 
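 *
 * Informally, the check below accepts the label when any of the
 * following hold:
 *
 *	- vd->vdev_guid matches the label's GUID
 *	- vd->vdev_guid matches the label's ORIG_GUID (split in progress)
 *	- trusted config, no extreme rewind: vd is a top-level vdev whose
 *	  guid matches the label's TOP_GUID
 *	- otherwise: vd->vdev_guid matches TOP_GUID, or the parent
 *	  top-level vdev's guid matches the label's GUID (stale cachefile
 *	  after a detach)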
*/ if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { boolean_t mismatch = B_FALSE; if (spa->spa_trust_config && !spa->spa_extreme_rewind) { if (vd != vd->vdev_top || vd->vdev_guid != top_guid) mismatch = B_TRUE; } else { if (vd->vdev_guid != top_guid && vd->vdev_top->vdev_guid != guid) mismatch = B_TRUE; } if (mismatch) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: config guid " "doesn't match label guid"); vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)vd->vdev_top->vdev_guid); vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " "aux_guid %llu", (u_longlong_t)guid, (u_longlong_t)top_guid, (u_longlong_t)aux_guid); return (0); } } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", ZPOOL_CONFIG_POOL_STATE); return (0); } nvlist_free(label); /* * If this is a verbatim import, no need to check the * state of the pool. */ if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) { vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " "for spa %s", (u_longlong_t)state, spa->spa_name); return (SET_ERROR(EBADF)); } /* * If we were able to open and validate a vdev that was * previously marked permanently unavailable, clear that state * now. */ if (vd->vdev_not_present) vd->vdev_not_present = 0; return (0); } static void vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) { if (svd != NULL && *dvd != NULL) { if (strcmp(svd, *dvd) != 0) { zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " "from '%s' to '%s'", (u_longlong_t)guid, prefix, *dvd, svd); spa_strfree(*dvd); *dvd = spa_strdup(svd); } } else if (svd != NULL) { *dvd = spa_strdup(svd); zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", (u_longlong_t)guid, *dvd); } } static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, dvd->vdev_guid); vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, dvd->vdev_guid); vdev_update_path("vdev_physpath", svd->vdev_physpath, &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports */ old = dvd->vdev_enc_sysfs_path; new = svd->vdev_enc_sysfs_path; if ((old != NULL && new == NULL) || (old == NULL && new != NULL) || ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, old, new); if (dvd->vdev_enc_sysfs_path) spa_strfree(dvd->vdev_enc_sysfs_path); if (svd->vdev_enc_sysfs_path) { dvd->vdev_enc_sysfs_path = spa_strdup( svd->vdev_enc_sysfs_path); } else { dvd->vdev_enc_sysfs_path = NULL; } } } /* * Recursively copy vdev paths from one vdev to another. Source and destination * vdev trees must have same geometry otherwise return error. Intended to copy * paths from userland config into MOS config. 
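 *
 * A minimal usage sketch (hypothetical caller; user_rvd and mos_rvd are
 * illustrative names for root vdevs built from the two configs):
 *
 *	if (vdev_copy_path_strict(user_rvd, mos_rvd) != 0)
 *		vdev_copy_path_relaxed(user_rvd, mos_rvd);
 *
 * i.e. fall back to the geometry-tolerant variant described further
 * below when the two trees do not line up exactly.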
*/ int vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) { if ((svd->vdev_ops == &vdev_missing_ops) || (svd->vdev_ishole && dvd->vdev_ishole) || (dvd->vdev_ops == &vdev_indirect_ops)) return (0); if (svd->vdev_ops != dvd->vdev_ops) { vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); return (SET_ERROR(EINVAL)); } if (svd->vdev_guid != dvd->vdev_guid) { vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " "%llu)", (u_longlong_t)svd->vdev_guid, (u_longlong_t)dvd->vdev_guid); return (SET_ERROR(EINVAL)); } if (svd->vdev_children != dvd->vdev_children) { vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " "%llu != %llu", (u_longlong_t)svd->vdev_children, (u_longlong_t)dvd->vdev_children); return (SET_ERROR(EINVAL)); } for (uint64_t i = 0; i < svd->vdev_children; i++) { int error = vdev_copy_path_strict(svd->vdev_child[i], dvd->vdev_child[i]); if (error != 0) return (error); } if (svd->vdev_ops->vdev_op_leaf) vdev_copy_path_impl(svd, dvd); return (0); } static void vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) { ASSERT(stvd->vdev_top == stvd); ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); for (uint64_t i = 0; i < dvd->vdev_children; i++) { vdev_copy_path_search(stvd, dvd->vdev_child[i]); } if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) return; /* * The idea here is that while a vdev can shift positions within * a top vdev (when replacing, attaching mirror, etc.) it cannot * step outside of it. */ vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) return; ASSERT(vd->vdev_ops->vdev_op_leaf); vdev_copy_path_impl(vd, dvd); } /* * Recursively copy vdev paths from one root vdev to another. Source and * destination vdev trees may differ in geometry. For each destination leaf * vdev, search a vdev with the same guid and top vdev id in the source. * Intended to copy paths from userland config into MOS config. */ void vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) { uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); ASSERT(srvd->vdev_ops == &vdev_root_ops); ASSERT(drvd->vdev_ops == &vdev_root_ops); for (uint64_t i = 0; i < children; i++) { vdev_copy_path_search(srvd->vdev_child[i], drvd->vdev_child[i]); } } /* * Close a virtual device. */ void vdev_close(vdev_t *vd) { vdev_t *pvd = vd->vdev_parent; spa_t *spa __maybe_unused = vd->vdev_spa; ASSERT(vd != NULL); ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* * If our parent is reopening, then we are as well, unless we are * going offline. */ if (pvd != NULL && pvd->vdev_reopening) vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); vd->vdev_ops->vdev_op_close(vd); /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. 
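 *
 * For example, reopening a leaf that is still unhealthy goes e.g.
 * CANT_OPEN -> vdev_close() -> CLOSED -> vdev_open() -> CANT_OPEN again;
 * because vdev_prevstate below remembers the earlier state, the reopen
 * does not raise a fresh ereport for the same condition.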
*/ vd->vdev_prevstate = vd->vdev_state; if (vd->vdev_offline) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void vdev_hold(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_is_root(spa)); if (spa->spa_state == POOL_STATE_UNINITIALIZED) return; for (int c = 0; c < vd->vdev_children; c++) vdev_hold(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) vd->vdev_ops->vdev_op_hold(vd); } void vdev_rele(vdev_t *vd) { ASSERT(spa_is_root(vd->vdev_spa)); for (int c = 0; c < vd->vdev_children; c++) vdev_rele(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) vd->vdev_ops->vdev_op_rele(vd); } /* * Reopen all interior vdevs and any unopened leaves. We don't actually * reopen leaf vdevs which had previously been opened as they might deadlock * on the spa_config_lock. Instead we only obtain the leaf's physical size. * If the leaf has never been opened then open it, as usual. */ void vdev_reopen(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); /* set the reopening flag unless we're taking the vdev offline */ vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); /* * Call vdev_validate() here to make sure we have the same device. * Otherwise, a device with an invalid label could be successfully * opened in response to vdev_reopen(). */ if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && vd->vdev_aux == &spa->spa_l2cache) { /* * In case the vdev is present we should evict all ARC * buffers and pointers to log blocks and reclaim their * space before restoring its contents to L2ARC. */ if (l2arc_vdev_present(vd)) { l2arc_rebuild_vdev(vd, B_TRUE); } else { l2arc_add_vdev(spa, vd); } spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); } } else { (void) vdev_validate(vd); } /* * Recheck if resilver is still needed and cancel any * scheduled resilver if resilver is unneeded. */ if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && spa->spa_async_tasks & SPA_ASYNC_RESILVER) { mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; mutex_exit(&spa->spa_async_lock); } /* * Reassess parent vdev's health. */ vdev_propagate_state(vd); } int vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) { int error; /* * Normally, partial opens (e.g. of a mirror) are allowed. * For a create, however, we want to fail the request if * there are any components we can't open. */ error = vdev_open(vd); if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { vdev_close(vd); return (error ? error : SET_ERROR(ENXIO)); } /* * Recursively load DTLs and initialize all labels. */ if ((error = vdev_dtl_load(vd)) != 0 || (error = vdev_label_init(vd, txg, isreplacing ? VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { vdev_close(vd); return (error); } return (0); } void vdev_metaslab_set_size(vdev_t *vd) { uint64_t asize = vd->vdev_asize; uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. 
* * The default values used below are a good balance between memory * usage (larger metaslab size means more memory needed for loaded * metaslabs; more metaslabs means more memory needed for the * metaslab_t structs), metaslab load time (larger metaslabs take * longer to load), and metaslab sync time (more metaslabs means * more time spent syncing all of them). * * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. * The range of the dimensions are as follows: * * 2^29 <= ms_size <= 2^34 * 16 <= ms_count <= 131,072 * * On the lower end of vdev sizes, we aim for metaslabs sizes of * at least 512MB (2^29) to minimize fragmentation effects when * testing with smaller devices. However, the count constraint * of at least 16 metaslabs will override this minimum size goal. * * On the upper end of vdev sizes, we aim for a maximum metaslab * size of 16GB. However, we will cap the total count to 2^17 * metaslabs to keep our memory footprint in check and let the * metaslab size grow from there if that limit is hit. * * The net effect of applying above constrains is summarized below. * * vdev size metaslab count * --------------|----------------- * < 8GB ~16 * 8GB - 100GB one per 512MB * 100GB - 3TB ~200 * 3TB - 2PB one per 16GB * > 2PB ~131,072 * -------------------------------- * * Finally, note that all of the above calculate the initial * number of metaslabs. Expanding a top-level vdev will result * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); else if (ms_count > zfs_vdev_default_ms_count) ms_shift = highbit64(asize / zfs_vdev_default_ms_count); else ms_shift = zfs_vdev_default_ms_shift; if (ms_shift < SPA_MAXBLOCKSHIFT) { ms_shift = SPA_MAXBLOCKSHIFT; } else if (ms_shift > zfs_vdev_max_ms_shift) { ms_shift = zfs_vdev_max_ms_shift; /* cap the total count to constrain memory footprint */ if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); } vd->vdev_ms_shift = ms_shift; ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); /* indirect vdevs don't have metaslabs or dtls */ ASSERT(vdev_is_concrete(vd) || flags == 0); ASSERT(ISP2(flags)); ASSERT(spa_writeable(vd->vdev_spa)); if (flags & VDD_METASLAB) (void) txg_list_add(&vd->vdev_ms_list, arg, txg); if (flags & VDD_DTL) (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) { for (int c = 0; c < vd->vdev_children; c++) vdev_dirty_leaves(vd->vdev_child[c], flags, txg); if (vd->vdev_ops->vdev_op_leaf) vdev_dirty(vd->vdev_top, flags, vd, txg); } /* * DTLs. * * A vdev's DTL (dirty time log) is the set of transaction groups for which * the vdev has less than perfect replication. There are four kinds of DTL: * * DTL_MISSING: txgs for which the vdev has no valid copies of the data * * DTL_PARTIAL: txgs for which data is available, but not fully replicated * * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of * txgs that was scrubbed. * * DTL_OUTAGE: txgs which cannot currently be read, whether due to * persistent errors or just some device being offline. 
* Unlike the other three, the DTL_OUTAGE map is not generally * maintained; it's only computed when needed, typically to * determine whether a device can be detached. * * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device * either has the data or it doesn't. * * For interior vdevs such as mirror and RAID-Z the picture is more complex. * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because * if any child is less than fully replicated, then so is its parent. * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, * comprising only those txgs which appear in 'maxfaults' or more children; * those are the txgs we don't have enough replication to read. For example, * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); * thus, its DTL_MISSING consists of the set of txgs that appear in more than * two child DTL_MISSING maps. * * It should be clear from the above that to compute the DTLs and outage maps * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. * Therefore, that is all we keep on disk. When loading the pool, or after * a configuration change, we generate all other DTLs from first principles. */ void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); ASSERT(spa_writeable(vd->vdev_spa)); mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_contains(rt, txg, size)) zfs_range_tree_add(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); } boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; boolean_t dirty = B_FALSE; ASSERT(t < DTL_TYPES); ASSERT(vd != vd->vdev_spa->spa_root_vdev); /* * While we are loading the pool, the DTLs have not been loaded yet. * This isn't a problem but it can result in devices being tried * which are known to not have the data. In which case, the import * is relying on the checksum to ensure that we get the right data. * Note that while importing we are only reading the MOS, which is * always checksummed. */ mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_is_empty(rt)) dirty = zfs_range_tree_contains(rt, txg, size); mutex_exit(&vd->vdev_dtl_lock); return (dirty); } boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) { zfs_range_tree_t *rt = vd->vdev_dtl[t]; boolean_t empty; mutex_enter(&vd->vdev_dtl_lock); empty = zfs_range_tree_is_empty(rt); mutex_exit(&vd->vdev_dtl_lock); return (empty); } /* * Check if the txg falls within the range which must be * resilvered. DVAs outside this range can always be skipped. */ boolean_t vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { (void) dva, (void) psize; /* Set by sequential resilver. */ if (phys_birth == TXG_UNKNOWN) return (B_TRUE); return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); } /* * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. */ boolean_t vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, uint64_t phys_birth) { ASSERT(vd != vd->vdev_spa->spa_root_vdev); if (vd->vdev_ops->vdev_op_need_resilver == NULL || vd->vdev_ops->vdev_op_leaf) return (B_TRUE); return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, phys_birth)); } /* * Returns the lowest txg in the DTL range. 
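 * Both this helper and vdev_dtl_max() below expect the caller to hold
 * vd->vdev_dtl_lock and the DTL_MISSING tree to be non-empty; both
 * conditions are asserted.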
*/ static uint64_t vdev_dtl_min(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); } /* * Returns the highest txg in the DTL. */ static uint64_t vdev_dtl_max(vdev_t *vd) { ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); ASSERT0(vd->vdev_children); return (zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING])); } /* * Determine if a resilvering vdev should remove any DTL entries from * its range. If the vdev was resilvering for the entire duration of the * scan then it should excise that range from its DTLs. Otherwise, this * vdev is considered partially resilvered and should leave its DTL * entries intact. The comment in vdev_dtl_reassess() describes how we * excise the DTLs. */ static boolean_t vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) { ASSERT0(vd->vdev_children); if (vd->vdev_state < VDEV_STATE_DEGRADED) return (B_FALSE); if (vd->vdev_resilver_deferred) return (B_FALSE); if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) return (B_TRUE); if (rebuild_done) { vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; /* Rebuild not initiated by attach */ if (vd->vdev_rebuild_txg == 0) return (B_TRUE); /* * When a rebuild completes without error then all missing data * up to the rebuild max txg has been reconstructed and the DTL * is eligible for excision. */ if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && vdev_dtl_max(vd) <= vrp->vrp_max_txg) { ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); return (B_TRUE); } } else { dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; /* Resilver not initiated by attach */ if (vd->vdev_resilver_txg == 0) return (B_TRUE); /* * When a resilver is initiated the scan will assign the * scn_max_txg value to the highest txg value that exists * in all DTLs. If this device's max DTL is not part of this * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] * then it is not eligible for excision. */ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); return (B_TRUE); } } return (B_FALSE); } /* * Reassess DTLs after a config change or scrub completion. If txg == 0 no * write operations will be issued to the pool. */ static void vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting) { spa_t *spa = vd->vdev_spa; avl_tree_t reftree; int minref; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); for (int c = 0; c < vd->vdev_children; c++) vdev_dtl_reassess_impl(vd->vdev_child[c], txg, scrub_txg, scrub_done, rebuild_done, faulting); if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) return; if (vd->vdev_ops->vdev_op_leaf) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; boolean_t check_excise = B_FALSE; boolean_t wasempty = B_TRUE; mutex_enter(&vd->vdev_dtl_lock); /* * If requested, pretend the scan or rebuild completed cleanly. 
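 * (This is the effect of the zfs_scan_ignore_errors tunable: the scan
 * or rebuild error counters are zeroed here so that the DTL excision
 * logic below proceeds as if the pass had finished without errors.)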
*/ if (zfs_scan_ignore_errors) { if (scn != NULL) scn->scn_phys.scn_errors = 0; if (vr != NULL) vr->vr_rebuild_phys.vrp_errors = 0; } if (scrub_txg != 0 && !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { wasempty = B_FALSE; zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " "dtl:%llu/%llu errors:%llu", (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, (u_longlong_t)scrub_txg, spa->spa_scrub_started, (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd), (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0)); } /* * If we've completed a scrub/resilver or a rebuild cleanly * then determine if this vdev should remove any DTLs. We * only want to excise regions on vdevs that were available * during the entire duration of this scan. */ if (rebuild_done && vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { check_excise = B_TRUE; } else { if (spa->spa_scrub_started || (scn != NULL && scn->scn_phys.scn_errors == 0)) { check_excise = B_TRUE; } } if (scrub_txg && check_excise && vdev_dtl_should_excise(vd, rebuild_done)) { /* * We completed a scrub, resilver or rebuild up to * scrub_txg. If we did it without rebooting, then * the scrub dtl will be valid, so excise the old * region and fold in the scrub dtl. Otherwise, * leave the dtl as-is if there was an error. * * There's little trick here: to excise the beginning * of the DTL_MISSING map, we put it into a reference * tree and then add a segment with refcnt -1 that * covers the range [0, scrub_txg). This means * that each txg in that range has refcnt -1 or 0. * We then add DTL_SCRUB with a refcnt of 2, so that * entries in the range [0, scrub_txg) will have a * positive refcnt -- either 1 or 2. We then convert * the reference tree into the new DTL_MISSING map. */ space_reftree_create(&reftree); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_add_seg(&reftree, 0, scrub_txg, -1); space_reftree_add_map(&reftree, vd->vdev_dtl[DTL_SCRUB], 2); space_reftree_generate_map(&reftree, vd->vdev_dtl[DTL_MISSING], 1); space_reftree_destroy(&reftree); if (!zfs_range_tree_is_empty( vd->vdev_dtl[DTL_MISSING])) { zfs_dbgmsg("update DTL_MISSING:%llu/%llu", (u_longlong_t)vdev_dtl_min(vd), (u_longlong_t)vdev_dtl_max(vd)); } else if (!wasempty) { zfs_dbgmsg("DTL_MISSING is now empty"); } } zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); /* * For the faulting case, treat members of a replacing vdev * as if they are not available. It's more likely than not that * a vdev in a replacing vdev could encounter read errors so * treat it as not being able to contribute. */ if (!vdev_readable(vd) || (faulting && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) { zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); } else { zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); } /* * If the vdev was resilvering or rebuilding and no longer * has any DTLs then reset the appropriate flag and dirty * the top level so that we persist the change. 
*/ if (txg != 0 && zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { if (vd->vdev_rebuild_txg != 0) { vd->vdev_rebuild_txg = 0; vdev_config_dirty(vd->vdev_top); } else if (vd->vdev_resilver_txg != 0) { vd->vdev_resilver_txg = 0; vdev_config_dirty(vd->vdev_top); } } mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); } else { mutex_enter(&vd->vdev_dtl_lock); for (int t = 0; t < DTL_TYPES; t++) { /* account for child's outage in parent's missing map */ int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; if (t == DTL_SCRUB) { /* leaf vdevs only */ continue; } if (t == DTL_PARTIAL) { /* i.e. non-zero */ minref = 1; } else if (vdev_get_nparity(vd) != 0) { /* RAIDZ, DRAID */ minref = vdev_get_nparity(vd) + 1; } else { /* any kind of mirror */ minref = vd->vdev_children; } space_reftree_create(&reftree); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; mutex_enter(&cvd->vdev_dtl_lock); space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); mutex_exit(&cvd->vdev_dtl_lock); } space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); space_reftree_destroy(&reftree); } mutex_exit(&vd->vdev_dtl_lock); } if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { raidz_dtl_reassessed(vd); } } void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, boolean_t scrub_done, boolean_t rebuild_done) { return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done, rebuild_done, B_FALSE)); } /* * Iterate over all the vdevs except spare, and post kobj events */ void vdev_post_kobj_evt(vdev_t *vd) { if (vd->vdev_ops->vdev_op_kobj_evt_post && vd->vdev_kobj_flag == B_FALSE) { vd->vdev_kobj_flag = B_TRUE; vd->vdev_ops->vdev_op_kobj_evt_post(vd); } for (int c = 0; c < vd->vdev_children; c++) vdev_post_kobj_evt(vd->vdev_child[c]); } /* * Iterate over all the vdevs except spare, and clear kobj events */ void vdev_clear_kobj_evt(vdev_t *vd) { vd->vdev_kobj_flag = B_FALSE; for (int c = 0; c < vd->vdev_children; c++) vdev_clear_kobj_evt(vd->vdev_child[c]); } int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; zfs_range_tree_t *rt; int error = 0; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { ASSERT(vdev_is_concrete(vd)); /* * If the dtl cannot be sync'd there is no need to open it. */ if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) return (0); error = space_map_open(&vd->vdev_dtl_sm, mos, vd->vdev_dtl_object, 0, -1ULL, 0); if (error) return (error); ASSERT(vd->vdev_dtl_sm != NULL); rt = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt")); error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); if (error == 0) { mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, vd->vdev_dtl[DTL_MISSING]); mutex_exit(&vd->vdev_dtl_lock); } zfs_range_tree_vacate(rt, NULL, NULL); zfs_range_tree_destroy(rt); return (error); } for (int c = 0; c < vd->vdev_children; c++) { error = vdev_dtl_load(vd->vdev_child[c]); if (error != 0) break; } return (error); } static void vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; const char *string; ASSERT(alloc_bias != VDEV_BIAS_NONE); string = (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : (alloc_bias == VDEV_BIAS_DEDUP) ? 
VDEV_ALLOC_BIAS_DEDUP : NULL; ASSERT(string != NULL); VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, strlen(string) + 1, string, tx)); if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { spa_activate_allocation_classes(spa, tx); } } void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zapobj, tx)); } uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); ASSERT(zap != 0); VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, zap, tx)); return (zap); } void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ops != &vdev_hole_ops && vd->vdev_ops != &vdev_missing_ops && vd->vdev_ops != &vdev_root_ops && !vd->vdev_top->vdev_removing) { if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); } if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { vd->vdev_top_zap = vdev_create_link_zap(vd, tx); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) vdev_zap_allocation_data(vd, tx); } } if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); vd->vdev_root_zap = vdev_create_link_zap(vd, tx); } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); } } static void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; zfs_range_tree_t *rtsync; dmu_tx_t *tx; uint64_t object = space_map_object(vd->vdev_dtl_sm); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_ops->vdev_op_leaf); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached || vd->vdev_top->vdev_removing) { mutex_enter(&vd->vdev_dtl_lock); space_map_free(vd->vdev_dtl_sm, tx); space_map_close(vd->vdev_dtl_sm); vd->vdev_dtl_sm = NULL; mutex_exit(&vd->vdev_dtl_lock); /* * We only destroy the leaf ZAP for detached leaves or for * removed log devices. Removed data devices handle leaf ZAP * cleanup later, once cancellation is no longer possible. */ if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || vd->vdev_top->vdev_islog)) { vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); vd->vdev_leaf_zap = 0; } dmu_tx_commit(tx); return; } if (vd->vdev_dtl_sm == NULL) { uint64_t new_object; new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); VERIFY3U(new_object, !=, 0); VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 0, -1ULL, 0)); ASSERT(vd->vdev_dtl_sm != NULL); } rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync")); mutex_enter(&vd->vdev_dtl_lock); zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); zfs_range_tree_vacate(rtsync, NULL, NULL); zfs_range_tree_destroy(rtsync); /* * If the object for the space map has changed then dirty * the top level so that we update the config. 
*/ if (object != space_map_object(vd->vdev_dtl_sm)) { vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " "new object %llu", (u_longlong_t)txg, spa_name(spa), (u_longlong_t)object, (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); vdev_config_dirty(vd->vdev_top); } dmu_tx_commit(tx); } /* * Determine whether the specified vdev can be * - offlined * - detached * - removed * - faulted * without losing data. */ boolean_t vdev_dtl_required(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *tvd = vd->vdev_top; uint8_t cant_read = vd->vdev_cant_read; boolean_t required; boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == spa->spa_root_vdev || vd == tvd) return (B_TRUE); /* * Temporarily mark the device as unreadable, and then determine * whether this results in any DTL outages in the top-level vdev. * If not, we can safely offline/detach/remove the device. */ vd->vdev_cant_read = B_TRUE; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); required = !vdev_dtl_empty(tvd, DTL_OUTAGE); vd->vdev_cant_read = cant_read; vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); if (!required && zio_injection_enabled) { required = !!zio_handle_device_injection(vd, NULL, SET_ERROR(ECHILD)); } return (required); } /* * Determine if resilver is needed, and if so the txg range. */ boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) { boolean_t needed = B_FALSE; uint64_t thismin = UINT64_MAX; uint64_t thismax = 0; if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && vdev_writeable(vd)) { thismin = vdev_dtl_min(vd); thismax = vdev_dtl_max(vd); needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; if (vdev_resilver_needed(cvd, &cmin, &cmax)) { thismin = MIN(thismin, cmin); thismax = MAX(thismax, cmax); needed = B_TRUE; } } } if (needed && minp) { *minp = thismin; *maxp = thismax; } return (needed); } /* * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj * will contain either the checkpoint spacemap object or zero if none exists. * All other errors are returned to the caller. */ int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) { ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); if (vd->vdev_top_zap == 0) { *sm_obj = 0; return (0); } int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); if (error == ENOENT) { *sm_obj = 0; error = 0; } return (error); } int vdev_load(vdev_t *vd) { int children = vd->vdev_children; int error = 0; taskq_t *tq = NULL; /* * It's only worthwhile to use the taskq for the root vdev, because the * slow part is metaslab_init, and that only happens for top-level * vdevs. */ if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { tq = taskq_create("vdev_load", children, minclsyspri, children, children, TASKQ_PREPOPULATE); } /* * Recursively load all children. 
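 * Children that sit on top of zvols are loaded inline rather than via
 * the taskq, for the same deadlock-avoidance reason described at
 * vdev_open_children_impl() above (see vdev_uses_zvols()).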
*/ for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; if (tq == NULL || vdev_uses_zvols(cvd)) { cvd->vdev_load_error = vdev_load(cvd); } else { VERIFY(taskq_dispatch(tq, vdev_load_child, cvd, TQ_SLEEP) != TASKQID_INVALID); } } if (tq != NULL) { taskq_wait(tq); taskq_destroy(tq); } for (int c = 0; c < vd->vdev_children; c++) { int error = vd->vdev_child[c]->vdev_load_error; if (error != 0) return (error); } vdev_set_deflate_ratio(vd); if (vd->vdev_ops == &vdev_raidz_ops) { error = vdev_raidz_load(vd); if (error != 0) return (error); } /* * On spa_load path, grab the allocation bias from our zap */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; char bias_str[64]; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), bias_str); if (error == 0) { ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); } else if (error != ENOENT) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); return (error); } } if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { spa_t *spa = vd->vdev_spa; uint64_t failfast; error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 1, &failfast); if (error == 0) { vd->vdev_failfast = failfast & 1; } else if (error == ENOENT) { vd->vdev_failfast = vdev_prop_default_numeric( VDEV_PROP_FAILFAST); } else { vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " "failed [error=%d]", (u_longlong_t)vd->vdev_top_zap, error); } } /* * Load any rebuild state from the top-level vdev zap. */ if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { error = vdev_rebuild_load(vd); if (error && error != ENOTSUP) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " "failed [error=%d]", error); return (error); } } if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { uint64_t zapobj; if (vd->vdev_top_zap != 0) zapobj = vd->vdev_top_zap; else zapobj = vd->vdev_leaf_zap; error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, &vd->vdev_checksum_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, &vd->vdev_checksum_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, &vd->vdev_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, &vd->vdev_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, &vd->vdev_slow_io_n); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, &vd->vdev_slow_io_t); if (error && error != ENOENT) vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " "failed [error=%d]", (u_longlong_t)zapobj, error); } /* * If this is a top-level vdev, initialize its metaslabs. 
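 * In short: create the metaslab group, reject vdevs whose ashift or
 * asize is zero as corrupt, run vdev_metaslab_init(vd, 0), and, if a
 * pool checkpoint exists, open the per-vdev checkpoint space map so
 * vs_checkpoint_space can be accounted.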
*/ if (vd == vd->vdev_top && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, " "asize=%llu", (u_longlong_t)vd->vdev_ashift, (u_longlong_t)vd->vdev_asize); return (SET_ERROR(ENXIO)); } error = vdev_metaslab_init(vd, 0); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " "[error=%d]", error); vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (error); } uint64_t checkpoint_sm_obj; error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); if (error == 0 && checkpoint_sm_obj != 0) { objset_t *mos = spa_meta_objset(vd->vdev_spa); ASSERT(vd->vdev_asize != 0); ASSERT0P(vd->vdev_checkpoint_sm); error = space_map_open(&vd->vdev_checkpoint_sm, mos, checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift); if (error != 0) { vdev_dbgmsg(vd, "vdev_load: space_map_open " "failed for checkpoint spacemap (obj %llu) " "[error=%d]", (u_longlong_t)checkpoint_sm_obj, error); return (error); } ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); /* * Since the checkpoint_sm contains free entries * exclusively we can use space_map_allocated() to * indicate the cumulative checkpointed space that * has been freed. */ vd->vdev_stat.vs_checkpoint_space = -space_map_allocated(vd->vdev_checkpoint_sm); vd->vdev_spa->spa_checkpoint_info.sci_dspace += vd->vdev_stat.vs_checkpoint_space; } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve " "checkpoint space map object from vdev ZAP " "[error=%d]", error); return (error); } } /* * If this is a leaf vdev, load its DTL. */ if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " "[error=%d]", error); return (error); } uint64_t obsolete_sm_object; error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); if (error == 0 && obsolete_sm_object != 0) { objset_t *mos = vd->vdev_spa->spa_meta_objset; ASSERT(vd->vdev_asize != 0); ASSERT0P(vd->vdev_obsolete_sm); if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, obsolete_sm_object, 0, vd->vdev_asize, 0))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " "obsolete spacemap (obj %llu) [error=%d]", (u_longlong_t)obsolete_sm_object, error); return (error); } } else if (error != 0) { vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " "space map object from vdev ZAP [error=%d]", error); return (error); } return (0); } /* * The special vdev case is used for hot spares and l2cache devices. Its * sole purpose it to set the vdev state for the associated vdev. To do this, * we make sure that we can open the underlying device, then try to read the * label, and make sure that the label is sane and that it hasn't been * repurposed to another pool. 
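 *
 * vdev_reopen() above takes this path for any vdev with vd->vdev_aux
 * set; for a readable and writeable l2cache device it then rebuilds or
 * adds the device to the L2ARC.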
*/ int vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; uint64_t state; if (!vdev_readable(vd)) return (0); if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (-1); } if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || !SPA_VERSION_IS_SUPPORTED(version) || nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || guid != vd->vdev_guid || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (-1); } /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. */ nvlist_free(label); return (0); } static void vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) { objset_t *mos = spa_meta_objset(vd->vdev_spa); if (vd->vdev_top_zap == 0) return; uint64_t object = 0; int err = zap_lookup(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); if (err == ENOENT) return; VERIFY0(err); VERIFY0(dmu_object_free(mos, object, tx)); VERIFY0(zap_remove(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); } /* * Free the objects used to store this vdev's spacemaps, and the array * that points to them. */ void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) { if (vd->vdev_ms_array == 0) return; objset_t *mos = vd->vdev_spa->spa_meta_objset; uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; size_t array_bytes = array_count * sizeof (uint64_t); uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, array_bytes, smobj_array, 0)); for (uint64_t i = 0; i < array_count; i++) { uint64_t smobj = smobj_array[i]; if (smobj == 0) continue; space_map_free_obj(mos, smobj, tx); } kmem_free(smobj_array, array_bytes); VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); vdev_destroy_ms_flush_data(vd, tx); vd->vdev_ms_array = 0; } static void vdev_remove_empty_log(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT3U(txg, ==, spa_syncing_txg(spa)); dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); vdev_destroy_spacemaps(vd, tx); if (vd->vdev_top_zap != 0) { vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); vd->vdev_top_zap = 0; } dmu_tx_commit(tx); } void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); ASSERT(vdev_is_concrete(vd)); while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) != NULL) metaslab_sync_done(msp, txg); if (reassess) { metaslab_sync_reassess(vd->vdev_mg); if (vd->vdev_log_mg != NULL) metaslab_sync_reassess(vd->vdev_log_mg); } } void vdev_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; vdev_t *lvd; metaslab_t *msp; ASSERT3U(txg, ==, spa->spa_syncing_txg); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) { ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); vdev_indirect_sync_obsolete(vd, tx); /* * If the vdev is indirect, it can't have dirty * metaslabs or DTLs. 
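 * In that case vdev_sync() simply asserts that the per-txg metaslab
 * and DTL lists are empty, commits the transaction, and returns early.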
*/ if (vd->vdev_ops == &vdev_indirect_ops) { ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); dmu_tx_commit(tx); return; } } ASSERT(vdev_is_concrete(vd)); if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && !vd->vdev_removing) { ASSERT(vd == vd->vdev_top); ASSERT0(vd->vdev_indirect_config.vic_mapping_object); vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); ASSERT(vd->vdev_ms_array != 0); vdev_config_dirty(vd); } while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); } while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) vdev_dtl_sync(lvd, txg); /* * If this is an empty log device being removed, destroy the * metadata associated with it. */ if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) vdev_remove_empty_log(vd, txg); (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); dmu_tx_commit(tx); } uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg) { return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg)); } /* * Return the amount of space that should be (or was) allocated for the given * psize (compressed block size) in the given TXG. Note that for expanded * RAIDZ vdevs, the size allocated for older BP's may be larger. See * vdev_raidz_psize_to_asize(). */ uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) { return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg)); } uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize) { return (vdev_psize_to_asize_txg(vd, psize, 0)); } /* * Mark the given vdev faulted. A faulted vdev behaves as if the device could * not be opened, and no I/O is attempted. */ int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd, *tvd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); tvd = vd->vdev_top; /* * If user did a 'zpool offline -f' then make the fault persist across * reboots. */ if (aux == VDEV_AUX_EXTERNAL_PERSIST) { /* * There are two kinds of forced faults: temporary and * persistent. Temporary faults go away at pool import, while * persistent faults stay set. Both types of faults can be * cleared with a zpool clear. * * We tell if a vdev is persistently faulted by looking at the * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at * import then it's a persistent fault. Otherwise, it's * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This * tells vdev_config_generate() (which gets run later) to set * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. */ vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; vd->vdev_tmpoffline = B_FALSE; aux = VDEV_AUX_EXTERNAL; } else { vd->vdev_tmpoffline = B_TRUE; } /* * We don't directly use the aux state here, but if we do a * vdev_reopen(), we need this value to be present to remember why we * were faulted. */ vd->vdev_label_aux = aux; /* * Faulted state takes precedence over degraded. 
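 * Accordingly, vdev_degraded is cleared when vdev_faulted is set, and
 * we only back off to DEGRADED below when vdev_dtl_required() shows
 * this device holds the only valid copy of some data.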
*/ vd->vdev_delayed_close = B_FALSE; vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If this device has the only valid copy of the data, then * back off and simply mark the vdev as degraded instead. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; /* * If we reopen the device and it's not dead, only then do we * mark it degraded. */ vdev_reopen(tvd); if (vdev_readable(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); } /* * Mark the given vdev degraded. A degraded vdev is purely an indication to the * user that something is wrong. The vdev continues to operate as normal as far * as I/O is concerned. */ int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); /* * If the vdev is already faulted, then don't do anything. */ if (vd->vdev_faulted || vd->vdev_degraded) return (spa_vdev_state_exit(spa, NULL, 0)); vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_remove_wanted(spa_t *spa, uint64_t guid) { vdev_t *vd; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* * If the vdev is already removed, or expanding which can trigger * repartition add/remove events, then don't do anything. */ if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); /* * Confirm the vdev has been removed, otherwise don't do anything. */ if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE_BY_USER); return (spa_vdev_state_exit(spa, vd, 0)); } /* * Online the given vdev. * * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached * spare device should be detached when the device finishes resilvering. * Second, the online should be treated like a 'test' online case, so no FMA * events are generated if the device fails to open. 
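 * For illustration only (a hypothetical caller, not part of this
 * file), onlining a former spare while also requesting expansion
 * could combine the flags like so:
 *
 *	vdev_state_t newstate;
 *	int error = vdev_online(spa, guid,
 *	    ZFS_ONLINE_UNSPARE | ZFS_ONLINE_EXPAND, &newstate);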
*/ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; boolean_t wasoffline; vdev_state_t oldstate; spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); oldstate = vd->vdev_state; tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); /* XXX - L2ARC 1.0 does not support expansion */ if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand); vd->vdev_expansion_time = gethrestime_sec(); } vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; if (!vd->vdev_aux) { for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) pvd->vdev_expanding = B_FALSE; } if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && !vdev_is_dead(vd) && vd->vdev_parent && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { /* XXX - L2ARC 1.0 does not support expansion */ if (vd->vdev_aux) return (spa_vdev_state_exit(spa, vd, ENOTSUP)); spa->spa_ccw_fail_time = 0; spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } /* Restart initializing if necessary */ mutex_enter(&vd->vdev_initialize_lock); if (vdev_writeable(vd) && vd->vdev_initialize_thread == NULL && vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { (void) vdev_initialize(vd); } mutex_exit(&vd->vdev_initialize_lock); /* * Restart trimming if necessary. We do not restart trimming for cache * devices here. This is triggered by l2arc_rebuild_vdev() * asynchronously for the whole device or in l2arc_evict() as it evicts * space for upcoming writes. */ mutex_enter(&vd->vdev_trim_lock); if (vdev_writeable(vd) && !vd->vdev_isl2cache && vd->vdev_trim_thread == NULL && vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, vd->vdev_trim_secure); } mutex_exit(&vd->vdev_trim_lock); if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); /* * Asynchronously detach spare vdev if resilver or * rebuild is not required */ if (vd->vdev_unspare && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && !vdev_rebuild_active(tvd)) spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); } return (spa_vdev_state_exit(spa, vd, 0)); } static int vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { vdev_t *vd, *tvd; int error = 0; uint64_t generation; metaslab_group_t *mg; top: spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); if (vd->vdev_ops == &vdev_draid_spare_ops) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); tvd = vd->vdev_top; mg = tvd->vdev_mg; generation = spa->spa_config_generation + 1; /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* * If this device has the only valid copy of some data, * don't allow it to be offlined. 
Log devices are always * expendable. */ if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); /* * If the top-level is a slog and it has had allocations * then proceed. We check that the vdev's metaslab group * is not NULL since it's possible that we may have just * added this vdev but not yet initialized its metaslabs. */ if (tvd->vdev_islog && mg != NULL) { /* * Prevent any future allocations. */ ASSERT0P(tvd->vdev_log_mg); metaslab_group_passivate(mg); (void) spa_vdev_state_exit(spa, vd, 0); error = spa_reset_logs(spa); /* * If the log device was successfully reset but has * checkpointed data, do not offline it. */ if (error == 0 && tvd->vdev_checkpoint_sm != NULL) { ASSERT3U(space_map_allocated( tvd->vdev_checkpoint_sm), !=, 0); error = ZFS_ERR_CHECKPOINT_EXISTS; } spa_vdev_state_enter(spa, SCL_ALLOC); /* * Check to see if the config has changed. */ if (error || generation != spa->spa_config_generation) { metaslab_group_activate(mg); if (error) return (spa_vdev_state_exit(spa, vd, error)); (void) spa_vdev_state_exit(spa, vd, 0); goto top; } ASSERT0(tvd->vdev_stat.vs_alloc); } /* * Offline this device and reopen its top-level vdev. * If the top-level vdev is a log device then just offline * it. Otherwise, if this action results in the top-level * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; vdev_reopen(tvd); if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EBUSY))); } /* * Add the device back into the metaslab rotor so that * once we online the device it's open for business. */ if (tvd->vdev_islog && mg != NULL) metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); return (spa_vdev_state_exit(spa, vd, 0)); } int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) { int error; mutex_enter(&spa->spa_vdev_top_lock); error = vdev_offline_locked(spa, guid, flags); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all * children. If 'vd' is NULL, then the user wants to clear all vdevs. */ void vdev_clear(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); if (vd == NULL) vd = rvd; vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vd->vdev_stat.vs_dio_verify_errors = 0; vd->vdev_stat.vs_slow_ios = 0; for (int c = 0; c < vd->vdev_children; c++) vdev_clear(spa, vd->vdev_child[c]); /* * It makes no sense to "clear" an indirect or removed vdev. */ if (!vdev_is_concrete(vd) || vd->vdev_removed) return; /* * If we're in the FAULTED state or have experienced failed I/O, then * clear the persistent state and attempt to reopen the device. We * also mark the vdev config dirty, so that the new faulted state is * written out to disk. */ if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { /* * When reopening in response to a clear event, it may be due to * a fmadm repair request. In this case, if the device is * still broken, we want to still post the ereport again. 
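 * That is why vdev_forcefault is raised around the vdev_reopen() call
 * below and cleared again immediately afterwards.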
*/ vd->vdev_forcefault = B_TRUE; vd->vdev_faulted = vd->vdev_degraded = 0ULL; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vd->vdev_stat.vs_aux = 0; vdev_reopen(vd == rvd ? rvd : vd->vdev_top); vd->vdev_forcefault = B_FALSE; if (vd != rvd && vdev_writeable(vd->vdev_top)) vdev_state_dirty(vd->vdev_top); /* If a resilver isn't required, check if vdevs can be culled */ if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && !dsl_scan_resilvering(spa->spa_dsl_pool) && !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); } /* * When clearing a FMA-diagnosed fault, we always want to * unspare the device, as we assume that the original spare was * done in response to the FMA fault. */ if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && vd->vdev_parent->vdev_ops == &vdev_spare_ops && vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = B_TRUE; /* Clear recent error events cache (i.e. duplicate events tracking) */ zfs_ereport_clear(spa, vd); } boolean_t vdev_is_dead(vdev_t *vd) { /* * Holes and missing devices are always considered "dead". * This simplifies the code since we don't have to check for * these types of devices in the various code paths. * Instead we rely on the fact that we skip over dead devices * before issuing I/O to them. */ return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ops == &vdev_hole_ops || vd->vdev_ops == &vdev_missing_ops); } boolean_t vdev_readable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_read); } boolean_t vdev_writeable(vdev_t *vd) { return (!vdev_is_dead(vd) && !vd->vdev_cant_write && vdev_is_concrete(vd)); } boolean_t vdev_allocatable(vdev_t *vd) { uint64_t state = vd->vdev_state; /* * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding * the proper locks. Note that we have to get the vdev state * in a local variable because although it changes atomically, * we're asking two separate questions about it. */ return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && !vd->vdev_cant_write && vdev_is_concrete(vd) && vd->vdev_mg->mg_initialized); } boolean_t vdev_accessible(vdev_t *vd, zio_t *zio) { ASSERT(zio->io_vd == vd); if (vdev_is_dead(vd) || vd->vdev_remove_wanted) return (B_FALSE); if (zio->io_type == ZIO_TYPE_READ) return (!vd->vdev_cant_read); if (zio->io_type == ZIO_TYPE_WRITE) return (!vd->vdev_cant_write); return (B_TRUE); } static void vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) { /* * Exclude the dRAID spare when aggregating to avoid double counting * the ops and bytes. These IOs are counted by the physical leaves. 
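 * Stats from a dRAID spare child are therefore skipped entirely; for
 * every other child the per-type ops and bytes are simply summed into
 * the parent's totals.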
*/ if (cvd->vdev_ops == &vdev_draid_spare_ops) return; for (int t = 0; t < VS_ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; } /* * Get extended stats */ static void vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) { (void) cvd; int t, b; for (t = 0; t < ZIO_TYPES; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { vsx->vsx_total_histo[t][b] += cvsx->vsx_total_histo[t][b]; } } for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { vsx->vsx_queue_histo[t][b] += cvsx->vsx_queue_histo[t][b]; } vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; } } boolean_t vdev_is_spacemap_addressable(vdev_t *vd) { if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) return (B_TRUE); /* * If double-word space map entries are not enabled we assume * 47 bits of the space map entry are dedicated to the entry's * offset (see SM_OFFSET_BITS in space_map.h). We then use that * to calculate the maximum address that can be described by a * space map entry for the given device. */ uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; if (shift >= 63) /* detect potential overflow */ return (B_TRUE); return (vd->vdev_asize < (1ULL << shift)); } /* * Get statistics for the given vdev. */ static void vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { int t; /* * If we're getting stats on the root vdev, aggregate the I/O counts * over all top-level vdevs (i.e. the direct children of the root). */ if (!vd->vdev_ops->vdev_op_leaf) { if (vs) { memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); } if (vsx) memset(vsx, 0, sizeof (*vsx)); for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; vdev_get_stats_ex_impl(cvd, cvs, cvsx); if (vs) vdev_get_child_stat(cvd, vs, cvs); if (vsx) vdev_get_child_stat_ex(cvd, vsx, cvsx); } } else { /* * We're a leaf. Just copy our ZIO active queue stats in. The * other leaf stats are updated in vdev_stat_update(). */ if (!vsx) return; memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) { vdev_t *tvd = vd->vdev_top; mutex_enter(&vd->vdev_stat_lock); if (vs) { memcpy(vs, &vd->vdev_stat, sizeof (*vs)); vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; vs->vs_rsize = vdev_get_min_asize(vd); if (vd->vdev_ops->vdev_op_leaf) { vs->vs_pspace = vd->vdev_psize; vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; /* * Report initializing progress. Since we don't * have the initializing locks held, this is only * an estimate (although a fairly accurate one). 
*/ vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done; vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est; vs->vs_initialize_state = vd->vdev_initialize_state; vs->vs_initialize_action_time = vd->vdev_initialize_action_time; /* * Report manual TRIM progress. Since we don't have * the manual TRIM locks held, this is only an * estimate (although fairly accurate one). */ vs->vs_trim_notsup = !vd->vdev_has_trim; vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; vs->vs_trim_state = vd->vdev_trim_state; vs->vs_trim_action_time = vd->vdev_trim_action_time; /* Set when there is a deferred resilver. */ vs->vs_resilver_deferred = vd->vdev_resilver_deferred; } /* * Report expandable space on top-level, non-auxiliary devices * only. The expandable space is reported in terms of metaslab * sized units since that determines how much space the pool * can expand. */ if (vd->vdev_aux == NULL && tvd != NULL) { vs->vs_esize = P2ALIGN_TYPED( vd->vdev_max_asize - vd->vdev_asize, 1ULL << tvd->vdev_ms_shift, uint64_t); } vs->vs_configured_ashift = vd->vdev_top != NULL ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; if (vd->vdev_physical_ashift <= ASHIFT_MAX) vs->vs_physical_ashift = vd->vdev_physical_ashift; else vs->vs_physical_ashift = 0; /* * Report fragmentation and rebuild progress for top-level, * non-auxiliary, concrete devices. */ if (vd->vdev_aux == NULL && vd == vd->vdev_top && vdev_is_concrete(vd)) { /* * The vdev fragmentation rating doesn't take into * account the embedded slog metaslab (vdev_log_mg). * Since it's only one metaslab, it would have a tiny * impact on the overall fragmentation. */ vs->vs_fragmentation = (vd->vdev_mg != NULL) ? vd->vdev_mg->mg_fragmentation : 0; } vs->vs_noalloc = MAX(vd->vdev_noalloc, tvd ? tvd->vdev_noalloc : 0); } vdev_get_stats_ex_impl(vd, vs, vsx); mutex_exit(&vd->vdev_stat_lock); } void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { return (vdev_get_stats_ex(vd, vs, NULL)); } void vdev_clear_stats(vdev_t *vd) { mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_space = 0; vd->vdev_stat.vs_dspace = 0; vd->vdev_stat.vs_alloc = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_scan_stat_init(vdev_t *vd) { vdev_stat_t *vs = &vd->vdev_stat; for (int c = 0; c < vd->vdev_children; c++) vdev_scan_stat_init(vd->vdev_child[c]); mutex_enter(&vd->vdev_stat_lock); vs->vs_scan_processed = 0; mutex_exit(&vd->vdev_stat_lock); } void vdev_stat_update(zio_t *zio, uint64_t psize) { spa_t *spa = zio->io_spa; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; /* Suppress ASAN false positive */ #ifdef __SANITIZE_ADDRESS__ vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; #else vdev_stat_t *vs = &vd->vdev_stat; vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; #endif zio_type_t type = zio->io_type; int flags = zio->io_flags; /* * If this i/o is a gang leader, it didn't do any actual work. */ if (zio->io_gang_tree) return; if (zio->io_error == 0) { /* * If this is a root i/o, don't count it -- we've already * counted the top-level vdevs, and vdev_get_stats() will * aggregate them when asked. This reduces contention on * the root vdev_stat_lock and implicitly handles blocks * that compress away to holes, for which there is no i/o. * (Holes never create vdev children, so all the counters * remain zero, which is what we want.) 
* * Note: this only applies to successful i/o (io_error == 0) * because unlike i/o counts, errors are not additive. * When reading a ditto block, for example, failure of * one top-level vdev does not imply a root-level error. */ if (vd == rvd) return; ASSERT(vd == zio->io_vd); if (flags & ZIO_FLAG_IO_BYPASS) return; mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_IO_REPAIR) { /* * Repair is the result of a resilver issued by the * scan thread (spa_sync). */ if (flags & ZIO_FLAG_SCAN_THREAD) { dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dsl_scan_phys_t *scn_phys = &scn->scn_phys; uint64_t *processed = &scn_phys->scn_processed; if (vd->vdev_ops->vdev_op_leaf) atomic_add_64(processed, psize); vs->vs_scan_processed += psize; } /* * Repair is the result of a rebuild issued by the * rebuild thread (vdev_rebuild_thread). To avoid * double counting repaired bytes the virtual dRAID * spare vdev is excluded from the processed bytes. */ if (zio->io_priority == ZIO_PRIORITY_REBUILD) { vdev_t *tvd = vd->vdev_top; vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_draid_spare_ops) { atomic_add_64(rebuilt, psize); } vs->vs_rebuild_processed += psize; } if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; } /* * The bytes/ops/histograms are recorded at the leaf level and * aggregated into the higher level vdevs in vdev_get_stats(). */ if (vd->vdev_ops->vdev_op_leaf && (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { zio_type_t vs_type = type; zio_priority_t priority = zio->io_priority; /* * TRIM ops and bytes are reported to user space as * ZIO_TYPE_FLUSH. This is done to preserve the * vdev_stat_t structure layout for user space. */ if (type == ZIO_TYPE_TRIM) vs_type = ZIO_TYPE_FLUSH; /* * Solely for the purposes of 'zpool iostat -lqrw' * reporting use the priority to categorize the IO. * Only the following are reported to user space: * * ZIO_PRIORITY_SYNC_READ, * ZIO_PRIORITY_SYNC_WRITE, * ZIO_PRIORITY_ASYNC_READ, * ZIO_PRIORITY_ASYNC_WRITE, * ZIO_PRIORITY_SCRUB, * ZIO_PRIORITY_TRIM, * ZIO_PRIORITY_REBUILD. */ if (priority == ZIO_PRIORITY_INITIALIZING) { ASSERT3U(type, ==, ZIO_TYPE_WRITE); priority = ZIO_PRIORITY_ASYNC_WRITE; } else if (priority == ZIO_PRIORITY_REMOVAL) { priority = ((type == ZIO_TYPE_WRITE) ? ZIO_PRIORITY_ASYNC_WRITE : ZIO_PRIORITY_ASYNC_READ); } vs->vs_ops[vs_type]++; vs->vs_bytes[vs_type] += psize; if (flags & ZIO_FLAG_DELEGATED) { vsx->vsx_agg_histo[priority] [RQ_HISTO(zio->io_size)]++; } else { vsx->vsx_ind_histo[priority] [RQ_HISTO(zio->io_size)]++; } if (zio->io_delta && zio->io_delay) { vsx->vsx_queue_histo[priority] [L_HISTO(zio->io_delta - zio->io_delay)]++; vsx->vsx_disk_histo[type] [L_HISTO(zio->io_delay)]++; vsx->vsx_total_histo[type] [L_HISTO(zio->io_delta)]++; } } mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; /* * If this is an I/O error that is going to be retried, then ignore the * error. Otherwise, the user may interpret B_FAILFAST I/O errors as * hard errors, when in reality they can happen for any number of * innocuous reasons (bus resets, MPxIO link failure, etc). */ if (zio->io_error == EIO && !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; /* * Intent logs writes won't propagate their error to the root * I/O so don't mark these types of failures as pool-level * errors. 
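 * Such writes are recognized below by a NULL io_vd combined with the
 * ZIO_FLAG_DONT_PROPAGATE flag.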
*/ if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) return; if (type == ZIO_TYPE_WRITE && txg != 0 && (!(flags & ZIO_FLAG_IO_REPAIR) || (flags & ZIO_FLAG_SCAN_THREAD) || spa->spa_claiming)) { /* * This is either a normal write (not a repair), or it's * a repair induced by the scrub thread, or it's a repair * made by zil_claim() during spa_load() in the first txg. * In the normal case, we commit the DTL change in the same * txg as the block was born. In the scrub-induced repair * case, we know that scrubs run in first-pass syncing context, * so we commit the DTL change in spa_syncing_txg(spa). * In the zil_claim() case, we commit in spa_first_txg(spa). * * We currently do not make DTL entries for failed spontaneous * self-healing writes triggered by normal (non-scrubbing) * reads, because we have no transactional context in which to * do so -- and it's not clear that it'd be desirable anyway. */ if (vd->vdev_ops->vdev_op_leaf) { uint64_t commit_txg = txg; if (flags & ZIO_FLAG_SCAN_THREAD) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); ASSERT(spa_sync_pass(spa) == 1); vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); commit_txg = spa_syncing_txg(spa); } else if (spa->spa_claiming) { ASSERT(flags & ZIO_FLAG_IO_REPAIR); commit_txg = spa_first_txg(spa); } ASSERT(commit_txg >= spa_syncing_txg(spa)); if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } if (vd != rvd) vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } int64_t vdev_deflated_space(vdev_t *vd, int64_t space) { ASSERT0((space & (SPA_MINBLOCKSIZE-1))); ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); } /* * Update the in-core space usage stats for this vdev, its metaslab class, * and the root vdev. */ void vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, int64_t space_delta) { (void) defer_delta; int64_t dspace_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion * factor. We must calculate this here and not at the root vdev * because the root vdev's psize-to-asize is simply the max of its * children's, thus not accurate enough for us. */ dspace_delta = vdev_deflated_space(vd, space_delta); mutex_enter(&vd->vdev_stat_lock); /* ensure we won't underflow */ if (alloc_delta < 0) { ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); } vd->vdev_stat.vs_alloc += alloc_delta; vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); /* every class but log contributes to root space stats */ if (vd->vdev_mg != NULL && !vd->vdev_islog) { ASSERT(!vd->vdev_isl2cache); mutex_enter(&rvd->vdev_stat_lock); rvd->vdev_stat.vs_alloc += alloc_delta; rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } /* Note: metaslab_class_space_update moved to metaslab_space_update */ } /* * Mark a top-level vdev's config as dirty, placing it on the dirty list * so that it will be written out next time the vdev configuration is synced. * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 
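 * Aux vdevs (spares and l2cache devices) are handled separately below:
 * their config nvlist is regenerated in place and sav_sync is set,
 * rather than using the SCL_CONFIG-protected dirty list.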
*/ void vdev_config_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int c; ASSERT(spa_writeable(spa)); /* * If this is an aux vdev (as with l2cache and spare devices), then we * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; nvlist_t **aux; uint_t naux; for (c = 0; c < sav->sav_count; c++) { if (sav->sav_vdevs[c] == vd) break; } if (c == sav->sav_count) { /* * We're being removed. There's nothing more to do. */ ASSERT(sav->sav_sync == B_TRUE); return; } sav->sav_sync = B_TRUE; if (nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_SPARES, &aux, &naux)); } ASSERT(c < naux); /* * Setting the nvlist in the middle if the array is a little * sketchy, but it will work. */ nvlist_free(aux[c]); aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); return; } /* * The dirty list is protected by the SCL_CONFIG lock. The caller * must either hold SCL_CONFIG as writer, or must be the sync thread * (which holds SCL_CONFIG as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); if (vd == rvd) { for (c = 0; c < rvd->vdev_children; c++) vdev_config_dirty(rvd->vdev_child[c]); } else { ASSERT(vd == vd->vdev_top); if (!list_link_active(&vd->vdev_config_dirty_node) && vdev_is_concrete(vd)) { list_insert_head(&spa->spa_config_dirty_list, vd); } } } void vdev_config_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_CONFIG, RW_READER))); ASSERT(list_link_active(&vd->vdev_config_dirty_node)); list_remove(&spa->spa_config_dirty_list, vd); } /* * Mark a top-level vdev's state as dirty, so that the next pass of * spa_sync() can convert this into vdev_config_dirty(). We distinguish * the state changes from larger config changes because they require * much less locking, and are often needed for administrative actions. */ void vdev_state_dirty(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_writeable(spa)); ASSERT(vd == vd->vdev_top); /* * The state list is protected by the SCL_STATE lock. The caller * must either hold SCL_STATE as writer, or must be the sync thread * (which holds SCL_STATE as reader). There's only one sync thread, * so this is sufficient to ensure mutual exclusion. */ ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); if (!list_link_active(&vd->vdev_state_dirty_node) && vdev_is_concrete(vd)) list_insert_head(&spa->spa_state_dirty_list, vd); } void vdev_state_clean(vdev_t *vd) { spa_t *spa = vd->vdev_spa; ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); ASSERT(list_link_active(&vd->vdev_state_dirty_node)); list_remove(&spa->spa_state_dirty_list, vd); } /* * Propagate vdev state up from children to parent. 
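 * Each level tallies its faulted, degraded and corrupted concrete
 * children, hands the counts to the vdev_op_state_change() callback,
 * and then recurses into its own parent.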
*/ void vdev_propagate_state(vdev_t *vd) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; vdev_t *child; if (vd->vdev_children > 0) { for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; /* * Don't factor holes or indirect vdevs into the * decision. */ if (!vdev_is_concrete(child)) continue; if (!vdev_readable(child) || (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were * degraded. */ if (child->vdev_islog && vd == rvd) degraded++; else faulted++; } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { degraded++; } if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) corrupted++; } vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); /* * Root special: if there is a top-level vdev that cannot be * opened due to corrupted metadata, then propagate the root * vdev's aux state as 'corrupt' rather than 'insufficient * replicas'. */ if (corrupted && vd == rvd && rvd->vdev_state == VDEV_STATE_CANT_OPEN) vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); } if (vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } /* * Set a vdev's state. If this is during an open, we don't update the parent * state, because we're in the process of opening children depth-first. * Otherwise, we propagate the change to the parent. * * If this routine places a device in a faulted state, an appropriate ereport is * generated. */ void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { uint64_t save_state; spa_t *spa = vd->vdev_spa; if (state == vd->vdev_state) { /* * Since vdev_offline() code path is already in an offline * state we can miss a statechange event to OFFLINE. Check * the previous state to catch this condition. */ if (vd->vdev_ops->vdev_op_leaf && (state == VDEV_STATE_OFFLINE) && (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { /* post an offline state change */ zfs_post_state_change(spa, vd, vd->vdev_prevstate); } vd->vdev_stat.vs_aux = aux; return; } save_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; /* * If we are setting the vdev state to anything but an open state, then * always close the underlying device unless the device has requested * a delayed close (i.e. we're about to remove or fault the device). * Otherwise, we keep accessible but invalid devices open forever. * We don't call vdev_close() itself, because that implies some extra * checks (offline, etc) that we don't want here. This is limited to * leaf devices, because otherwise closing the device will affect other * children. */ if (!vd->vdev_delayed_close && vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { /* * If the previous state is set to VDEV_STATE_REMOVED, then this * device was previously marked removed and someone attempted to * reopen it. If this failed due to a nonexistent device, then * keep the device in the REMOVED state. We also let this be if * it is one of our special test online cases, which is only * attempting to online the device and shouldn't generate an FMA * fault. 
*/ vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* * If we fail to open a vdev during an import or recovery, we * mark it as "not available", which signifies that it was * never there to begin with. Failure to open such a device * is not considered an error. */ if ((spa_load_state(spa) == SPA_LOAD_IMPORT || spa_load_state(spa) == SPA_LOAD_RECOVER) && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; /* * Post the appropriate ereport. If the 'prevstate' field is * set to something other than VDEV_STATE_UNKNOWN, it indicates * that this is part of a vdev_reopen(). In this case, we don't * want to post the ereport if the device was already in the * CANT_OPEN state beforehand. * * If the 'checkremove' flag is set, then this is an attempt to * online the device in response to an insertion event. If we * hit this case, then we have detected an insertion event for a * faulted or offline device that wasn't in the removed state. * In this scenario, we don't post an ereport because we are * about to replace the device, or attempt an online with * vdev_forcefault, which will generate the fault for us. */ if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && !vd->vdev_not_present && !vd->vdev_checkremove && vd != spa->spa_root_vdev) { const char *class; switch (aux) { case VDEV_AUX_OPEN_FAILED: class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; break; case VDEV_AUX_CORRUPT_DATA: class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; break; case VDEV_AUX_NO_REPLICAS: class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; break; case VDEV_AUX_BAD_GUID_SUM: class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; break; case VDEV_AUX_TOO_SMALL: class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; break; case VDEV_AUX_BAD_LABEL: class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; break; case VDEV_AUX_BAD_ASHIFT: class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; break; default: class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } (void) zfs_ereport_post(class, spa, vd, NULL, NULL, save_state); } /* Erase any notion of persistent removed state */ vd->vdev_removed = B_FALSE; } else { vd->vdev_removed = B_FALSE; } /* * Notify ZED of any significant state-change on a leaf vdev. * */ if (vd->vdev_ops->vdev_op_leaf) { /* preserve original state from a vdev_reopen() */ if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && (vd->vdev_prevstate != vd->vdev_state) && (save_state <= VDEV_STATE_CLOSED)) save_state = vd->vdev_prevstate; /* filter out state change due to initial vdev_open */ if (save_state > VDEV_STATE_CLOSED) zfs_post_state_change(spa, vd, save_state); } if (!isopen && vd->vdev_parent) vdev_propagate_state(vd->vdev_parent); } boolean_t vdev_children_are_offline(vdev_t *vd) { ASSERT(!vd->vdev_ops->vdev_op_leaf); for (uint64_t i = 0; i < vd->vdev_children; i++) { if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) return (B_FALSE); } return (B_TRUE); } /* * Check the vdev configuration to ensure that it's capable of supporting * a root pool. We do not support partial configuration. 
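 * In practice the recursion below only rejects configurations that
 * contain a "missing" interior vdev; everything else is accepted.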
*/ boolean_t vdev_is_bootable(vdev_t *vd) { if (!vd->vdev_ops->vdev_op_leaf) { const char *vdev_type = vd->vdev_ops->vdev_op_type; if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) return (B_FALSE); } for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } boolean_t vdev_is_concrete(vdev_t *vd) { vdev_ops_t *ops = vd->vdev_ops; if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || ops == &vdev_missing_ops || ops == &vdev_root_ops) { return (B_FALSE); } else { return (B_TRUE); } } /* * Determine if a log device has valid content. If the vdev was * removed or faulted in the MOS config then we know that * the content on the log device has already been written to the pool. */ boolean_t vdev_log_state_valid(vdev_t *vd) { if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && !vd->vdev_removed) return (B_TRUE); for (int c = 0; c < vd->vdev_children; c++) if (vdev_log_state_valid(vd->vdev_child[c])) return (B_TRUE); return (B_FALSE); } /* * Expand a vdev if possible. */ void vdev_expand(vdev_t *vd, uint64_t txg) { ASSERT(vd->vdev_top == vd); ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vdev_is_concrete(vd)); vdev_set_deflate_ratio(vd); if ((vd->vdev_spa->spa_raidz_expand == NULL || vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && vdev_is_concrete(vd)) { vdev_metaslab_group_create(vd); VERIFY0(vdev_metaslab_init(vd, txg)); vdev_config_dirty(vd); } } /* * Split a vdev. */ void vdev_split(vdev_t *vd) { vdev_t *cvd, *pvd = vd->vdev_parent; VERIFY3U(pvd->vdev_children, >, 1); vdev_remove_child(pvd, vd); vdev_compact_children(pvd); ASSERT3P(pvd->vdev_child, !=, NULL); cvd = pvd->vdev_child[0]; if (pvd->vdev_children == 1) { vdev_remove_parent(cvd); cvd->vdev_splitting = B_TRUE; } vdev_propagate_state(cvd); } void vdev_deadman(vdev_t *vd, const char *tag) { for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; vdev_deadman(cvd, tag); } if (vd->vdev_ops->vdev_op_leaf) { vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; zfs_dbgmsg("slow vdev: %s has %u active IOs", vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); } mutex_exit(&vq->vq_lock); } } void vdev_defer_resilver(vdev_t *vd) { ASSERT(vd->vdev_ops->vdev_op_leaf); vd->vdev_resilver_deferred = B_TRUE; vd->vdev_spa->spa_resilver_deferred = B_TRUE; } /* * Clears the resilver deferred flag on all leaf devs under vd. Returns * B_TRUE if we have devices that need to be resilvered and are available to * accept resilver I/Os. 
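 * When called on the root vdev this also drops the pool's
 * SPA_FEATURE_RESILVER_DEFER reference (if active) and clears
 * spa_resilver_deferred after every child has been visited.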
*/ boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) { boolean_t resilver_needed = B_FALSE; spa_t *spa = vd->vdev_spa; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); } if (vd == spa->spa_root_vdev && spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); vdev_config_dirty(vd); spa->spa_resilver_deferred = B_FALSE; return (resilver_needed); } if (!vdev_is_concrete(vd) || vd->vdev_aux || !vd->vdev_ops->vdev_op_leaf) return (resilver_needed); vd->vdev_resilver_deferred = B_FALSE; return (!vdev_is_dead(vd) && !vd->vdev_offline && vdev_resilver_needed(vd, NULL, NULL)); } boolean_t vdev_xlate_is_empty(zfs_range_seg64_t *rs) { return (rs->rs_start == rs->rs_end); } /* * Translate a logical range to the first contiguous physical range for the * specified vdev_t. This function is initially called with a leaf vdev and * will walk each parent vdev until it reaches a top-level vdev. Once the * top-level is reached the physical range is initialized and the recursive * function begins to unwind. As it unwinds it calls the parent's vdev * specific translation function to do the real conversion. */ void vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) { /* * Walk up the vdev tree */ if (vd != vd->vdev_top) { vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, remain_rs); } else { /* * We've reached the top-level vdev, initialize the physical * range to the logical range and set an empty remaining * range then start to unwind. */ physical_rs->rs_start = logical_rs->rs_start; physical_rs->rs_end = logical_rs->rs_end; remain_rs->rs_start = logical_rs->rs_start; remain_rs->rs_end = logical_rs->rs_start; return; } vdev_t *pvd = vd->vdev_parent; ASSERT3P(pvd, !=, NULL); ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); /* * As this recursive function unwinds, translate the logical * range into its physical and any remaining components by calling * the vdev specific translate function. */ zfs_range_seg64_t intermediate = { 0 }; pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); physical_rs->rs_start = intermediate.rs_start; physical_rs->rs_end = intermediate.rs_end; } void vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, vdev_xlate_func_t *func, void *arg) { zfs_range_seg64_t iter_rs = *logical_rs; zfs_range_seg64_t physical_rs; zfs_range_seg64_t remain_rs; while (!vdev_xlate_is_empty(&iter_rs)) { vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); /* * With raidz and dRAID, it's possible that the logical range * does not live on this leaf vdev. Only when there is a non- * zero physical size call the provided function. */ if (!vdev_xlate_is_empty(&physical_rs)) func(arg, &physical_rs); iter_rs = remain_rs; } } static char * vdev_name(vdev_t *vd, char *buf, int buflen) { if (vd->vdev_path == NULL) { if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { strlcpy(buf, vd->vdev_spa->spa_name, buflen); } else if (!vd->vdev_ops->vdev_op_leaf) { snprintf(buf, buflen, "%s-%llu", vd->vdev_ops->vdev_op_type, (u_longlong_t)vd->vdev_id); } } else { strlcpy(buf, vd->vdev_path, buflen); } return (buf); } /* * Look at the vdev tree and determine whether any devices are currently being * replaced. 
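 * A replace is reported for any replacing vdev, and for a spare vdev
 * that has more than two children or whose second child (the hot
 * spare) still has entries in DTL_MISSING.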
*/ boolean_t vdev_replace_in_progress(vdev_t *vdev) { ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); if (vdev->vdev_ops == &vdev_replacing_ops) return (B_TRUE); /* * A 'spare' vdev indicates that we have a replace in progress, unless * it has exactly two children, and the second, the hot spare, has * finished being resilvered. */ if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) return (B_TRUE); for (int i = 0; i < vdev->vdev_children; i++) { if (vdev_replace_in_progress(vdev->vdev_child[i])) return (B_TRUE); } return (B_FALSE); } /* * Add a (source=src, propname=propval) list to an nvlist. */ static void vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval, uint64_t intval, zprop_source_t src) { nvlist_t *propval; propval = fnvlist_alloc(); fnvlist_add_uint64(propval, ZPROP_SOURCE, src); if (strval != NULL) fnvlist_add_string(propval, ZPROP_VALUE, strval); else fnvlist_add_uint64(propval, ZPROP_VALUE, intval); fnvlist_add_nvlist(nvl, propname, propval); nvlist_free(propval); } static void vdev_props_set_sync(void *arg, dmu_tx_t *tx) { vdev_t *vd; nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; uint64_t vdev_guid; uint64_t objid; nvlist_t *nvprops; vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV); nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS); vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); /* this vdev could get removed while waiting for this sync task */ if (vd == NULL) return; /* * Set vdev property values in the vdev props mos object. */ if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { panic("unexpected vdev type"); } mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { uint64_t intval; const char *strval; vdev_prop_t prop; const char *propname = nvpair_name(elem); zprop_type_t proptype; switch (prop = vdev_name_to_prop(propname)) { case VDEV_PROP_USERPROP: if (vdev_prop_user(propname)) { strval = fnvpair_value_string(elem); if (strlen(strval) == 0) { /* remove the property if value == "" */ (void) zap_remove(mos, objid, propname, tx); } else { VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); } spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } break; default: /* normalize the property name */ propname = vdev_prop_to_name(prop); proptype = vdev_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, objid, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%s", (u_longlong_t)vdev_guid, nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(vdev_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, objid, propname, sizeof (uint64_t), 1, &intval, tx)); spa_history_log_internal(spa, "vdev set", tx, "vdev_guid=%llu: %s=%lld", (u_longlong_t)vdev_guid, nvpair_name(elem), (longlong_t)intval); } else { panic("invalid vdev property type %u", nvpair_type(elem)); } } } 
mutex_exit(&spa->spa_props_lock); } int vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; nvpair_t *elem = NULL; uint64_t vdev_guid; nvlist_t *nvprops; int error = 0; ASSERT(vd != NULL); /* Check that vdev has a zap we can use */ if (vd->vdev_root_zap == 0 && vd->vdev_top_zap == 0 && vd->vdev_leaf_zap == 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, &nvprops) != 0) return (SET_ERROR(EINVAL)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) return (SET_ERROR(EINVAL)); while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { const char *propname = nvpair_name(elem); vdev_prop_t prop = vdev_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) { error = EINVAL; goto end; } if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) { error = EROFS; goto end; } /* Special Processing */ switch (prop) { case VDEV_PROP_PATH: if (vd->vdev_path == NULL) { error = EROFS; break; } if (nvpair_value_string(elem, &strval) != 0) { error = EINVAL; break; } /* New path must start with /dev/ */ if (strncmp(strval, "/dev/", 5)) { error = EINVAL; break; } error = spa_vdev_setpath(spa, vdev_guid, strval); break; case VDEV_PROP_ALLOCATING: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } if (intval != vd->vdev_noalloc) break; if (intval == 0) error = spa_vdev_noalloc(spa, vdev_guid); else error = spa_vdev_alloc(spa, vdev_guid); break; case VDEV_PROP_FAILFAST: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_failfast = intval & 1; break; case VDEV_PROP_CHECKSUM_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_n = intval; break; case VDEV_PROP_CHECKSUM_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_checksum_t = intval; break; case VDEV_PROP_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_n = intval; break; case VDEV_PROP_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_io_t = intval; break; case VDEV_PROP_SLOW_IO_N: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_n = intval; break; case VDEV_PROP_SLOW_IO_T: if (nvpair_value_uint64(elem, &intval) != 0) { error = EINVAL; break; } vd->vdev_slow_io_t = intval; break; default: /* Most processing is done in vdev_props_set_sync */ break; } end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); return (error); } } return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync, innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } int vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; int err = 0; uint64_t objid; uint64_t vdev_guid; nvpair_t *elem = NULL; nvlist_t *nvprops = NULL; uint64_t intval = 0; char *strval = NULL; const char *propname = NULL; vdev_prop_t prop; ASSERT(vd != NULL); ASSERT(mos != NULL); if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); if (vd->vdev_root_zap != 0) { objid = vd->vdev_root_zap; } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { 
objid = vd->vdev_leaf_zap; } else { return (SET_ERROR(EINVAL)); } ASSERT(objid != 0); mutex_enter(&spa->spa_props_lock); if (nvprops != NULL) { char namebuf[64] = { 0 }; while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { intval = 0; strval = NULL; propname = nvpair_name(elem); prop = vdev_name_to_prop(propname); zprop_source_t src = ZPROP_SRC_DEFAULT; uint64_t integer_size, num_integers; switch (prop) { /* Special Read-only Properties */ case VDEV_PROP_NAME: strval = vdev_name(vd, namebuf, sizeof (namebuf)); if (strval == NULL) continue; vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_CAPACITY: /* percent used */ intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : (vd->vdev_stat.vs_alloc * 100 / vd->vdev_stat.vs_dspace); vdev_prop_add_list(outnvl, propname, NULL, intval, ZPROP_SRC_NONE); continue; case VDEV_PROP_STATE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_state, ZPROP_SRC_NONE); continue; case VDEV_PROP_GUID: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_guid, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_asize, ZPROP_SRC_NONE); continue; case VDEV_PROP_PSIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_psize, ZPROP_SRC_NONE); continue; case VDEV_PROP_ASHIFT: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_ashift, ZPROP_SRC_NONE); continue; case VDEV_PROP_SIZE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); continue; case VDEV_PROP_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_dspace - vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_ALLOCATED: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); continue; case VDEV_PROP_EXPANDSZ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRAGMENTATION: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_fragmentation, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARITY: vdev_prop_add_list(outnvl, propname, NULL, vdev_get_nparity(vd), ZPROP_SRC_NONE); continue; case VDEV_PROP_PATH: if (vd->vdev_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_DEVID: if (vd->vdev_devid == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_devid, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PHYS_PATH: if (vd->vdev_physpath == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_physpath, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_ENC_PATH: if (vd->vdev_enc_sysfs_path == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_FRU: if (vd->vdev_fru == NULL) continue; vdev_prop_add_list(outnvl, propname, vd->vdev_fru, 0, ZPROP_SRC_NONE); continue; case VDEV_PROP_PARENT: if (vd->vdev_parent != NULL) { strval = vdev_name(vd->vdev_parent, namebuf, sizeof (namebuf)); vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); } continue; case VDEV_PROP_CHILDREN: if (vd->vdev_children > 0) strval = kmem_zalloc(ZAP_MAXVALUELEN, KM_SLEEP); for (uint64_t i = 0; i < vd->vdev_children; i++) { const char *vname; vname = vdev_name(vd->vdev_child[i], namebuf, sizeof (namebuf)); if (vname == NULL) vname = "(unknown)"; if (strlen(strval) > 0) strlcat(strval, ",", ZAP_MAXVALUELEN); strlcat(strval, vname, ZAP_MAXVALUELEN); } if (strval != NULL) { vdev_prop_add_list(outnvl, propname, strval, 0, ZPROP_SRC_NONE); 
kmem_free(strval, ZAP_MAXVALUELEN); } continue; case VDEV_PROP_NUMCHILDREN: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_children, ZPROP_SRC_NONE); continue; case VDEV_PROP_READ_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_read_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_WRITE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_write_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_CHECKSUM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_checksum_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_INITIALIZE_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_initialize_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_TRIM_ERRORS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_trim_errors, ZPROP_SRC_NONE); continue; case VDEV_PROP_SLOW_IOS: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_slow_ios, ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_OPS_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. */ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_NULL: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_READ: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_WRITE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_FREE: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_CLAIM: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], ZPROP_SRC_NONE); continue; case VDEV_PROP_BYTES_TRIM: /* * TRIM ops and bytes are reported to user * space as ZIO_TYPE_FLUSH. This is done to * preserve the vdev_stat_t structure layout * for user space. 
*/ vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], ZPROP_SRC_NONE); continue; case VDEV_PROP_REMOVING: vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_removing, ZPROP_SRC_NONE); continue; case VDEV_PROP_RAIDZ_EXPANDING: /* Only expose this for raidz */ if (vd->vdev_ops == &vdev_raidz_ops) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_rz_expanding, ZPROP_SRC_NONE); } continue; case VDEV_PROP_TRIM_SUPPORT: /* only valid for leaf vdevs */ if (vd->vdev_ops->vdev_op_leaf) { vdev_prop_add_list(outnvl, propname, NULL, vd->vdev_has_trim, ZPROP_SRC_NONE); } continue; /* Numeric Properites */ case VDEV_PROP_ALLOCATING: /* Leaf vdevs cannot have this property */ if (vd->vdev_mg == NULL && vd->vdev_top != NULL) { src = ZPROP_SRC_NONE; intval = ZPROP_BOOLEAN_NA; } else { err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; } vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; err = zap_lookup(mos, objid, nvpair_name(elem), sizeof (uint64_t), 1, &intval); if (err == ENOENT) { intval = vdev_prop_default_numeric( prop); err = 0; } else if (err) { break; } if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; vdev_prop_add_list(outnvl, propname, strval, intval, src); break; case VDEV_PROP_CHECKSUM_N: case VDEV_PROP_CHECKSUM_T: case VDEV_PROP_IO_N: case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; if (intval == vdev_prop_default_numeric(prop)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; /* Text Properties */ case VDEV_PROP_COMMENT: /* Exists in the ZAP below */ /* FALLTHRU */ case VDEV_PROP_USERPROP: /* User Properites */ src = ZPROP_SRC_LOCAL; err = zap_length(mos, objid, nvpair_name(elem), &integer_size, &num_integers); if (err) break; switch (integer_size) { case 8: /* User properties cannot be integers */ err = EINVAL; break; case 1: /* string property */ strval = kmem_alloc(num_integers, KM_SLEEP); err = zap_lookup(mos, objid, nvpair_name(elem), 1, num_integers, strval); if (err) { kmem_free(strval, num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, num_integers); break; } break; default: err = ENOENT; break; } if (err) break; } } else { /* * Get all properties from the MOS vdev property object. 
*/ zap_cursor_t zc; zap_attribute_t *za = zap_attribute_alloc(); for (zap_cursor_init(&zc, mos, objid); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { intval = 0; strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; propname = za->za_name; switch (za->za_integer_length) { case 8: /* We do not allow integer user properties */ /* This is likely an internal value */ break; case 1: /* string property */ strval = kmem_alloc(za->za_num_integers, KM_SLEEP); err = zap_lookup(mos, objid, za->za_name, 1, za->za_num_integers, strval); if (err) { kmem_free(strval, za->za_num_integers); break; } vdev_prop_add_list(outnvl, propname, strval, 0, src); kmem_free(strval, za->za_num_integers); break; default: break; } } zap_cursor_fini(&zc); zap_attribute_free(za); } mutex_exit(&spa->spa_props_lock); if (err && err != ENOENT) { return (err); } return (0); } EXPORT_SYMBOL(vdev_fault); EXPORT_SYMBOL(vdev_degrade); EXPORT_SYMBOL(vdev_online); EXPORT_SYMBOL(vdev_offline); EXPORT_SYMBOL(vdev_clear); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, "Default lower limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, "Practical upper limit of total metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, "Rate limit slow IO (delay) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, "Rate limit hung IO (deadman) events to this many per second"); ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, "Rate Direct I/O write verify events to this many per second"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, "Direct I/O writes will perform for checksum verification before " "commiting write"); ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, "Rate limit checksum events to this many checksum errors per second " "(do not set below ZED threshold)."); ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, "Ignore errors during resilver/scrub"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, "Bypass vdev_validate()"); ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, "Disable cache flushes"); ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, "Minimum number of metaslabs required to dedicate one for log blocks"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, param_set_min_auto_ashift, param_get_uint, ZMOD_RW, "Minimum ashift used when creating new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, param_set_max_auto_ashift, param_get_uint, ZMOD_RW, "Maximum ashift used when optimizing for logical -> physical sector " "size on new top-level vdevs"); ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl, param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW, "RAIDZ implementation"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_queue.c b/sys/contrib/openzfs/module/zfs/vdev_queue.c index c12713b107bf..e69e5598939e 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_queue.c +++ b/sys/contrib/openzfs/module/zfs/vdev_queue.c @@ -1,1145 +1,1145 @@ // 
SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * ZFS I/O Scheduler * --------------- * * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The * I/O scheduler determines when and in what order those operations are * issued. The I/O scheduler divides operations into five I/O classes * prioritized in the following order: sync read, sync write, async read, * async write, and scrub/resilver. Each queue defines the minimum and * maximum number of concurrent operations that may be issued to the device. * In addition, the device has an aggregate maximum. Note that the sum of the * per-queue minimums must not exceed the aggregate maximum. If the * sum of the per-queue maximums exceeds the aggregate maximum, then the * number of active i/os may reach zfs_vdev_max_active, in which case no * further i/os will be issued regardless of whether all per-queue * minimums have been met. * * For many physical devices, throughput increases with the number of * concurrent operations, but latency typically suffers. Further, physical * devices typically have a limit at which more concurrent operations have no * effect on throughput or can actually cause it to decrease. * * The scheduler selects the next operation to issue by first looking for an * I/O class whose minimum has not been satisfied. Once all are satisfied and * the aggregate maximum has not been hit, the scheduler looks for classes * whose maximum has not been satisfied. Iteration through the I/O classes is * done in the order specified above. No further operations are issued if the * aggregate maximum number of concurrent operations has been hit or if there * are no operations queued for an I/O class that has not hit its maximum. * Every time an i/o is queued or an operation completes, the I/O scheduler * looks for new operations to issue. * * All I/O classes have a fixed maximum number of outstanding operations * except for the async write class. Asynchronous writes represent the data * that is committed to stable storage during the syncing stage for * transaction groups (see txg.c). Transaction groups enter the syncing state * periodically so the number of queued async writes will quickly burst up and * then bleed down to zero. Rather than servicing them as quickly as possible, * the I/O scheduler changes the maximum number of active async write i/os * according to the amount of dirty data in the pool (see dsl_pool.c). 
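/*
 * Editor's illustrative sketch (not part of this change): a stand-alone,
 * user-space model of the selection order described above -- first serve
 * any class that has not met its minimum, then any class that has not
 * reached its maximum, all bounded by an aggregate limit. The enum, struct
 * and field names below are hypothetical stand-ins for the
 * zfs_vdev_*_{min,max}_active and zfs_vdev_max_active tunables; the real
 * vdev_queue_class_to_issue() additionally round-robins the starting class
 * of the first pass to reduce starvation.
 */
#include <stddef.h>

enum { EX_SYNC_READ, EX_SYNC_WRITE, EX_ASYNC_READ, EX_ASYNC_WRITE,
        EX_SCRUB, EX_NCLASSES };

typedef struct {
        unsigned queued[EX_NCLASSES];           /* I/Os waiting per class */
        unsigned active[EX_NCLASSES];           /* I/Os issued per class */
        unsigned min_active[EX_NCLASSES];
        unsigned max_active[EX_NCLASSES];
        unsigned aggregate_max;                 /* like zfs_vdev_max_active */
} ex_queue_t;

/* Return the class to issue from next, or -1 if nothing is eligible. */
static int
ex_class_to_issue(const ex_queue_t *q)
{
        unsigned total = 0;

        for (int c = 0; c < EX_NCLASSES; c++)
                total += q->active[c];
        if (total >= q->aggregate_max)
                return (-1);

        /* Pass 1: a class that has not yet met its minimum. */
        for (int c = 0; c < EX_NCLASSES; c++)
                if (q->queued[c] > 0 && q->active[c] < q->min_active[c])
                        return (c);

        /* Pass 2: a class that has not yet reached its maximum. */
        for (int c = 0; c < EX_NCLASSES; c++)
                if (q->queued[c] > 0 && q->active[c] < q->max_active[c])
                        return (c);

        return (-1);
}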
Since * both throughput and latency typically increase with the number of * concurrent operations issued to physical devices, reducing the burstiness * in the number of concurrent operations also stabilizes the response time of * operations from other -- and in particular synchronous -- queues. In broad * strokes, the I/O scheduler will issue more concurrent operations from the * async write queue as there's more dirty data in the pool. * * Async Writes * * The number of concurrent operations issued for the async write I/O class * follows a piece-wise linear function defined by a few adjustable points. * * | o---------| <-- zfs_vdev_async_write_max_active * ^ | /^ | * | | / | | * active | / | | * I/O | / | | * count | / | | * | / | | * |------------o | | <-- zfs_vdev_async_write_min_active * 0|____________^______|_________| * 0% | | 100% of zfs_dirty_data_max * | | * | `-- zfs_vdev_async_write_active_max_dirty_percent * `--------- zfs_vdev_async_write_active_min_dirty_percent * * Until the amount of dirty data exceeds a minimum percentage of the dirty * data allowed in the pool, the I/O scheduler will limit the number of * concurrent operations to the minimum. As that threshold is crossed, the * number of concurrent operations issued increases linearly to the maximum at * the specified maximum percentage of the dirty data allowed in the pool. * * Ideally, the amount of dirty data on a busy pool will stay in the sloped * part of the function between zfs_vdev_async_write_active_min_dirty_percent * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the * maximum percentage, this indicates that the rate of incoming data is * greater than the rate that the backend storage can handle. In this case, we * must further throttle incoming writes (see dmu_tx_delay() for details). */ /* * The maximum number of i/os active to each device. Ideally, this will be >= * the sum of each queue's max_active. */ -uint_t zfs_vdev_max_active = 1000; +static uint_t zfs_vdev_max_active = 1000; /* * Per-queue limits on the number of i/os active to each device. If the * number of active i/os is < zfs_vdev_max_active, then the min_active comes * into play. We will send min_active from each queue round-robin, and then * send from queues in the order defined by zio_priority_t up to max_active. * Some queues have additional mechanisms to limit number of active I/Os in * addition to min_active and max_active, see below. * * In general, smaller max_active's will lead to lower latency of synchronous * operations. Larger max_active's may lead to higher overall throughput, * depending on underlying storage. * * The ratio of the queues' max_actives determines the balance of performance * between reads, writes, and scrubs. E.g., increasing * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete * more quickly, but reads and writes to have higher latency and lower * throughput. 
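/*
 * Editor's illustrative sketch (not part of this change): the piece-wise
 * linear function drawn above, as a self-contained user-space helper. The
 * ex_* values are stand-ins that happen to mirror the defaults used later
 * in this file (min/max active of 2/10, dirty thresholds of 30%/60%).
 */
#include <stdint.h>

static unsigned ex_aw_min_active = 2;
static unsigned ex_aw_max_active = 10;
static unsigned ex_min_dirty_pct = 30;
static unsigned ex_max_dirty_pct = 60;

/*
 * Map the amount of dirty data to a number of concurrently active async
 * writes: flat at the minimum below the low threshold, flat at the maximum
 * above the high threshold, linearly interpolated in between.
 */
static unsigned
ex_async_write_active(uint64_t dirty, uint64_t dirty_max)
{
        uint64_t lo = dirty_max * ex_min_dirty_pct / 100;
        uint64_t hi = dirty_max * ex_max_dirty_pct / 100;

        if (dirty <= lo)
                return (ex_aw_min_active);
        if (dirty >= hi)
                return (ex_aw_max_active);
        return (ex_aw_min_active + (unsigned)((dirty - lo) *
            (ex_aw_max_active - ex_aw_min_active) / (hi - lo)));
}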
*/ static uint_t zfs_vdev_sync_read_min_active = 10; static uint_t zfs_vdev_sync_read_max_active = 10; static uint_t zfs_vdev_sync_write_min_active = 10; static uint_t zfs_vdev_sync_write_max_active = 10; static uint_t zfs_vdev_async_read_min_active = 1; /* */ uint_t zfs_vdev_async_read_max_active = 3; static uint_t zfs_vdev_async_write_min_active = 2; static uint_t zfs_vdev_async_write_max_active = 10; static uint_t zfs_vdev_scrub_min_active = 1; static uint_t zfs_vdev_scrub_max_active = 3; static uint_t zfs_vdev_removal_min_active = 1; static uint_t zfs_vdev_removal_max_active = 2; static uint_t zfs_vdev_initializing_min_active = 1; static uint_t zfs_vdev_initializing_max_active = 1; static uint_t zfs_vdev_trim_min_active = 1; static uint_t zfs_vdev_trim_max_active = 2; static uint_t zfs_vdev_rebuild_min_active = 1; static uint_t zfs_vdev_rebuild_max_active = 3; /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent * dirty data, use zfs_vdev_async_write_min_active. When it has more than * zfs_vdev_async_write_active_max_dirty_percent, use * zfs_vdev_async_write_max_active. The value is linearly interpolated * between min and max. */ uint_t zfs_vdev_async_write_active_min_dirty_percent = 30; uint_t zfs_vdev_async_write_active_max_dirty_percent = 60; /* * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild), * the number of concurrently-active I/O's is limited to *_min_active, unless * the vdev is "idle". When there are no interactive I/Os active (sync or * async), and zfs_vdev_nia_delay I/Os have completed since the last * interactive I/O, then the vdev is considered to be "idle", and the number * of concurrently-active non-interactive I/O's is increased to *_max_active. */ static uint_t zfs_vdev_nia_delay = 5; /* * Some HDDs tend to prioritize sequential I/O so high that concurrent * random I/O latency reaches several seconds. On some HDDs it happens * even if sequential I/Os are submitted one at a time, and so setting * *_max_active to 1 does not help. To prevent non-interactive I/Os, like * scrub, from monopolizing the device no more than zfs_vdev_nia_credit * I/Os can be sent while there are outstanding incomplete interactive * I/Os. This enforced wait ensures the HDD services the interactive I/O * within a reasonable amount of time. */ static uint_t zfs_vdev_nia_credit = 5; /* * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. * For read I/Os, we also aggregate across small adjacency gaps; for writes * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ static uint_t zfs_vdev_aggregation_limit = 1 << 20; static uint_t zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; static uint_t zfs_vdev_read_gap_limit = 32 << 10; static uint_t zfs_vdev_write_gap_limit = 4 << 10; static int vdev_queue_offset_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; int cmp = TREE_CMP(z1->io_offset, z2->io_offset); if (likely(cmp)) return (cmp); return (TREE_PCMP(z1, z2)); } #define VDQ_T_SHIFT 29 static int vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, z2->io_timestamp >> VDQ_T_SHIFT); int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); int cmp = tcmp ? 
tcmp : ocmp; if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } static inline boolean_t vdev_queue_class_fifo(zio_priority_t p) { return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || p == ZIO_PRIORITY_TRIM); } static void vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) { zio_priority_t p = zio->io_priority; vq->vq_cqueued |= 1U << p; if (vdev_queue_class_fifo(p)) { list_insert_tail(&vq->vq_class[p].vqc_list, zio); vq->vq_class[p].vqc_list_numnodes++; } else avl_add(&vq->vq_class[p].vqc_tree, zio); } static void vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) { zio_priority_t p = zio->io_priority; uint32_t empty; if (vdev_queue_class_fifo(p)) { list_t *list = &vq->vq_class[p].vqc_list; list_remove(list, zio); empty = list_is_empty(list); vq->vq_class[p].vqc_list_numnodes--; } else { avl_tree_t *tree = &vq->vq_class[p].vqc_tree; avl_remove(tree, zio); empty = avl_is_empty(tree); } vq->vq_cqueued &= ~(empty << p); } static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_min_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_min_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_min_active); case ZIO_PRIORITY_ASYNC_WRITE: return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active : MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); case ZIO_PRIORITY_REMOVAL: return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active : MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); case ZIO_PRIORITY_INITIALIZING: return (vq->vq_ia_active == 0 ?zfs_vdev_initializing_min_active: MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_min_active); case ZIO_PRIORITY_REBUILD: return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active : MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); default: panic("invalid priority %u", p); return (0); } } static uint_t vdev_queue_max_async_writes(spa_t *spa) { uint_t writes; uint64_t dirty = 0; dsl_pool_t *dp = spa_get_dsl(spa); uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; uint64_t max_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_max_dirty_percent / 100; /* * Async writes may occur before the assignment of the spa's * dsl_pool_t if a self-healing zio is issued prior to the * completion of dmu_objset_open_impl(). */ if (dp == NULL) return (zfs_vdev_async_write_max_active); /* * Sync tasks correspond to interactive user actions. To reduce the * execution time of those actions we push data out as fast as possible. 
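/*
 * Editor's illustrative sketch (not part of this change): the
 * zfs_vdev_nia_delay / zfs_vdev_nia_credit behaviour described earlier in
 * this file, restated as a stand-alone helper for the scrub class. The
 * ex_* values mirror the defaults above (nia_delay 5, scrub min/max 1/3);
 * the real logic lives in vdev_queue_class_{min,max}_active().
 */
static unsigned ex_nia_delay = 5;
static unsigned ex_scrub_min = 1;
static unsigned ex_scrub_max = 3;

/*
 * How many scrub I/Os may be active, given the number of in-flight
 * interactive (sync or async) I/Os and the queue's current nia credit
 * counter.
 */
static unsigned
ex_scrub_max_active(unsigned interactive_active, unsigned credit)
{
        if (interactive_active > 0)     /* throttle behind interactive I/O */
                return (credit < ex_scrub_min ? credit : ex_scrub_min);
        if (credit < ex_nia_delay)      /* device not yet considered idle */
                return (ex_scrub_min > 1 ? ex_scrub_min : 1);
        return (ex_scrub_max);          /* idle: open the throttle */
}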
*/ dirty = dp->dp_dirty_total; if (dirty > max_bytes || spa_has_pending_synctask(spa)) return (zfs_vdev_async_write_max_active); if (dirty < min_bytes) return (zfs_vdev_async_write_min_active); /* * linear interpolation: * slope = (max_writes - min_writes) / (max_bytes - min_bytes) * move right by min_bytes * move up by min_writes */ writes = (dirty - min_bytes) * (zfs_vdev_async_write_max_active - zfs_vdev_async_write_min_active) / (max_bytes - min_bytes) + zfs_vdev_async_write_min_active; ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); return (writes); } static uint_t vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: return (zfs_vdev_sync_read_max_active); case ZIO_PRIORITY_SYNC_WRITE: return (zfs_vdev_sync_write_max_active); case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_scrub_min_active)); return (zfs_vdev_scrub_max_active); case ZIO_PRIORITY_REMOVAL: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_removal_min_active)); return (zfs_vdev_removal_max_active); case ZIO_PRIORITY_INITIALIZING: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_initializing_min_active)); return (zfs_vdev_initializing_max_active); case ZIO_PRIORITY_TRIM: return (zfs_vdev_trim_max_active); case ZIO_PRIORITY_REBUILD: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) return (MAX(1, zfs_vdev_rebuild_min_active)); return (zfs_vdev_rebuild_max_active); default: panic("invalid priority %u", p); return (0); } } /* * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if * there is no eligible class. */ static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { uint32_t cq = vq->vq_cqueued; zio_priority_t p, p1; if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* * Find a queue that has not reached its minimum # outstanding i/os. * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. */ p1 = vq->vq_last_prio + 1; if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) p1 = 0; for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < vdev_queue_class_min_active(vq, p)) goto found; } for (p = 0; p < p1; p++) { if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < vdev_queue_class_min_active(vq, p)) goto found; } /* * If we haven't found a queue, look for one that hasn't reached its * maximum # outstanding i/os. 
*/ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < vdev_queue_class_max_active(vq, p)) break; } found: vq->vq_last_prio = p; return (p); } void vdev_queue_init(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; vq->vq_vdev = vd; for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (vdev_queue_class_fifo(p)) { list_create(&vq->vq_class[p].vqc_list, sizeof (zio_t), offsetof(struct zio, io_queue_node.l)); } else { avl_create(&vq->vq_class[p].vqc_tree, vdev_queue_to_compare, sizeof (zio_t), offsetof(struct zio, io_queue_node.a)); } } avl_create(&vq->vq_read_offset_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); avl_create(&vq->vq_write_offset_tree, vdev_queue_offset_compare, sizeof (zio_t), offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; list_create(&vq->vq_active_list, sizeof (struct zio), offsetof(struct zio, io_queue_node.l)); mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { if (vdev_queue_class_fifo(p)) list_destroy(&vq->vq_class[p].vqc_list); else avl_destroy(&vq->vq_class[p].vqc_tree); } avl_destroy(&vq->vq_read_offset_tree); avl_destroy(&vq->vq_write_offset_tree); list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { zio->io_queue_state = ZIO_QS_QUEUED; vdev_queue_class_add(vq, zio); if (zio->io_type == ZIO_TYPE_READ) avl_add(&vq->vq_read_offset_tree, zio); else if (zio->io_type == ZIO_TYPE_WRITE) avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { vdev_queue_class_remove(vq, zio); if (zio->io_type == ZIO_TYPE_READ) avl_remove(&vq->vq_read_offset_tree, zio); else if (zio->io_type == ZIO_TYPE_WRITE) avl_remove(&vq->vq_write_offset_tree, zio); zio->io_queue_state = ZIO_QS_NONE; } static boolean_t vdev_queue_is_interactive(zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SCRUB: case ZIO_PRIORITY_REMOVAL: case ZIO_PRIORITY_INITIALIZING: case ZIO_PRIORITY_REBUILD: return (B_FALSE); default: return (B_TRUE); } } static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_cactive[zio->io_priority]++; vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } zio->io_queue_state = ZIO_QS_ACTIVE; list_insert_tail(&vq->vq_active_list, zio); } static void vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_cactive[zio->io_priority]--; vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; else vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; list_remove(&vq->vq_active_list, zio); zio->io_queue_state = ZIO_QS_NONE; } static void vdev_queue_agg_io_done(zio_t *aio) { abd_free(aio->io_abd); } /* * Compute the range spanned by two i/os, which is the endpoint of the last * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). 
* Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. */ #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) /* * Sufficiently adjacent io_offset's in ZIOs will be aggregated. We do this * by creating a gang ABD from the adjacent ZIOs io_abd's. By using * a gang ABD we avoid doing memory copies to and from the parent, * child ZIOs. The gang ABD also accounts for gaps between adjacent * io_offsets by simply getting the zero ABD for writes or allocating * a new ABD for reads and placing them in the gang ABD as well. */ static zio_t * vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; uint64_t maxgap = 0; uint64_t size; uint64_t limit; boolean_t stretch = B_FALSE; uint64_t next_offset; abd_t *abd; avl_tree_t *t; /* * TRIM aggregation should not be needed since code in zfs_trim.c can * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). */ if (zio->io_type == ZIO_TYPE_TRIM) return (NULL); if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) return (NULL); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; if (limit == 0) return (NULL); limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID * leaf vdevs for aggregation. See the comment at the end of the * zio_vdev_io_start() function. */ ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); first = last = zio; if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; t = &vq->vq_read_offset_tree; } else { ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); t = &vq->vq_write_offset_tree; } /* * We can aggregate I/Os that are sufficiently adjacent and of * the same flavor, as expressed by the AGG_INHERIT flags. * The latter requirement is necessary so that certain * attributes of the I/O, such as whether it's a normal I/O * or a scrub/resilver, can be preserved in the aggregate. * We can include optional I/Os, but don't allow them * to begin a range as they add no benefit in that situation. */ /* * We keep track of the last non-optional I/O. */ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; /* * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && IO_GAP(dio, first) <= maxgap && dio->io_type == zio->io_type) { first = dio; if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = first; } /* * Skip any initial optional I/Os. */ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { first = AVL_NEXT(t, first); ASSERT(first != NULL); } /* * Walk forward through sufficiently contiguous I/Os. * The aggregation limit does not apply to optional i/os, so that * we can issue contiguous writes even if they are larger than the * aggregation limit. 
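/*
 * Editor's illustrative sketch (not part of this change): the forward walk
 * below, modeled over a plain offset-sorted array instead of the AVL tree,
 * using the same IO_SPAN()/IO_GAP() arithmetic. All names are hypothetical;
 * the real code also walks backwards, matches the AGG_INHERIT flags and
 * handles optional I/Os.
 */
#include <stdint.h>
#include <stddef.h>

typedef struct { uint64_t offset; uint64_t size; } ex_io_t;

/* Free space between the end of a and the start of b (b follows a). */
#define EX_GAP(a, b)    ((b)->offset - ((a)->offset + (a)->size))
/* Range covered from the start of first to the end of last. */
#define EX_SPAN(first, last) \
        ((last)->offset + (last)->size - (first)->offset)

/*
 * Starting at index 'start' of an offset-sorted, non-overlapping array,
 * return the index of the last I/O that can join the aggregate, honoring a
 * maximum inter-I/O gap and a maximum total span.
 */
static size_t
ex_aggregate_last(const ex_io_t *ios, size_t n, size_t start,
    uint64_t max_gap, uint64_t max_span)
{
        size_t last = start;

        while (last + 1 < n &&
            EX_GAP(&ios[last], &ios[last + 1]) <= max_gap &&
            EX_SPAN(&ios[start], &ios[last + 1]) <= max_span)
                last++;

        return (last);
}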
*/ while ((dio = AVL_NEXT(t, last)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) mandatory = last; } /* * Now that we've established the range of the I/O aggregation * we must decide what to do with trailing optional I/Os. * For reads, there's nothing to do. While we are unable to * aggregate further, it's possible that a trailing optional * I/O would allow the underlying device to aggregate with * subsequent I/Os. We must therefore determine if the next * non-optional I/O is close enough to make aggregation * worthwhile. */ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { zio_t *nio = last; while ((dio = AVL_NEXT(t, nio)) != NULL && IO_GAP(nio, dio) == 0 && IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { nio = dio; if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { stretch = B_TRUE; break; } } } if (stretch) { /* * We are going to include an optional io in our aggregated * span, thus closing the write gap. Only mandatory i/os can * start aggregated spans, so make sure that the next i/o * after our span is mandatory. */ dio = AVL_NEXT(t, last); ASSERT3P(dio, !=, NULL); dio->io_flags &= ~ZIO_FLAG_OPTIONAL; } else { /* do not include the optional i/o */ while (last != mandatory && last != first) { ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); last = AVL_PREV(t, last); ASSERT(last != NULL); } } if (first == last) return (NULL); size = IO_SPAN(first, last); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) return (NULL); aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; next_offset = first->io_offset; do { dio = nio; nio = AVL_NEXT(t, dio); ASSERT3P(dio, !=, NULL); zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); if (dio->io_offset != next_offset) { /* allocate a buffer for a read gap */ ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); ASSERT3U(dio->io_offset, >, next_offset); abd = abd_alloc_for_io( dio->io_offset - next_offset, B_TRUE); abd_gang_add(aio->io_abd, abd, B_TRUE); } if (dio->io_abd && (dio->io_size != abd_get_size(dio->io_abd))) { /* abd size not the same as IO size */ ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size); abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); abd_gang_add(aio->io_abd, abd, B_TRUE); } else { if (dio->io_flags & ZIO_FLAG_NODATA) { /* allocate a buffer for a write gap */ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); ASSERT0P(dio->io_abd); abd_gang_add(aio->io_abd, abd_get_zeros(dio->io_size), B_TRUE); } else { /* * We pass B_FALSE to abd_gang_add() * because we did not allocate a new * ABD, so it is assumed the caller * will free this ABD. */ abd_gang_add(aio->io_abd, dio->io_abd, B_FALSE); } } next_offset = dio->io_offset + dio->io_size; } while (dio != last); ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size); /* * Callers must call zio_vdev_io_bypass() and zio_execute() for * aggregated (parent) I/Os so that we could avoid dropping the * queue's lock here to avoid a deadlock that we could encounter * due to lock order reversal between vq_lock and io_lock in * zio_change_priority(). 
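/*
 * Editor's illustrative sketch (not part of this change): the gap
 * accounting behind the gang ABD assembled above. The aggregate covers one
 * contiguous range, so every hole between member I/Os must be backed by a
 * filler buffer (zeros for writes, a throwaway buffer for reads). Types
 * and names are hypothetical.
 */
#include <stdint.h>
#include <stddef.h>

typedef struct { uint64_t offset; uint64_t size; } ex_extent_t;

/* Filler bytes needed to make a sorted, non-overlapping run contiguous. */
static uint64_t
ex_filler_bytes(const ex_extent_t *ext, size_t n)
{
        uint64_t fill = 0;

        for (size_t i = 0; i + 1 < n; i++)
                fill += ext[i + 1].offset - (ext[i].offset + ext[i].size);
        return (fill);
}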
*/ return (aio); } static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq) { zio_t *zio, *aio; zio_priority_t p; avl_index_t idx; avl_tree_t *tree; again: ASSERT(MUTEX_HELD(&vq->vq_lock)); p = vdev_queue_class_to_issue(vq); if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { /* No eligible queued i/os */ return (NULL); } if (vdev_queue_class_fifo(p)) { zio = list_head(&vq->vq_class[p].vqc_list); } else { /* * For LBA-ordered queues (async / scrub / initializing), * issue the I/O which follows the most recently issued I/O * in LBA (offset) order, but to avoid starvation only within * the same 0.5 second interval as the first I/O. */ tree = &vq->vq_class[p].vqc_tree; zio = aio = avl_first(tree); if (zio->io_offset < vq->vq_last_offset) { vq->vq_io_search.io_timestamp = zio->io_timestamp; vq->vq_io_search.io_offset = vq->vq_last_offset; zio = avl_find(tree, &vq->vq_io_search, &idx); if (zio == NULL) { zio = avl_nearest(tree, idx, AVL_AFTER); if (zio == NULL || (zio->io_timestamp >> VDQ_T_SHIFT) != (aio->io_timestamp >> VDQ_T_SHIFT)) zio = aio; } } } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); if (aio != NULL) { zio = aio; } else { vdev_queue_io_remove(vq, zio); /* * If the I/O is or was optional and therefore has no data, we * need to simply discard it. We need to drop the vdev queue's * lock to avoid a deadlock that we could encounter since this * I/O will complete immediately. */ if (zio->io_flags & ZIO_FLAG_NODATA) { mutex_exit(&vq->vq_lock); zio_vdev_io_bypass(zio); zio_execute(zio); mutex_enter(&vq->vq_lock); goto again; } } vdev_queue_pending_add(vq, zio); vq->vq_last_offset = zio->io_offset + zio->io_size; return (zio); } zio_t * vdev_queue_io(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *dio, *nio; zio_link_t *zl = NULL; if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) return (zio); /* * Children i/os inherent their parent's priority, which might * not match the child's i/o type. Fix it up here. 
*/ if (zio->io_type == ZIO_TYPE_READ) { ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_READ; } } else if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && zio->io_priority != ZIO_PRIORITY_REMOVAL && zio->io_priority != ZIO_PRIORITY_INITIALIZING && zio->io_priority != ZIO_PRIORITY_REBUILD) { zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; } } else { ASSERT(zio->io_type == ZIO_TYPE_TRIM); ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); mutex_exit(&vq->vq_lock); if (nio == NULL) return (NULL); if (nio->io_done == vdev_queue_agg_io_done) { while ((dio = zio_walk_parents(nio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, nio->io_type); zio_vdev_io_bypass(dio); zio_execute(dio); } zio_nowait(nio); return (NULL); } return (nio); } void vdev_queue_io_done(zio_t *zio) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; zio_t *dio, *nio; zio_link_t *zl = NULL; hrtime_t now = gethrtime(); vq->vq_io_complete_ts = now; vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp; mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { mutex_exit(&vq->vq_lock); if (nio->io_done == vdev_queue_agg_io_done) { while ((dio = zio_walk_parents(nio, &zl)) != NULL) { ASSERT3U(dio->io_type, ==, nio->io_type); zio_vdev_io_bypass(dio); zio_execute(dio); } zio_nowait(nio); } else { zio_vdev_io_reissue(nio); zio_execute(nio); } mutex_enter(&vq->vq_lock); } mutex_exit(&vq->vq_lock); } void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio * code to issue IOs without adding them to the vdev queue. In this * case, the zio is already going to be issued as quickly as possible * and so it doesn't need any reprioritization to help. */ if (zio->io_priority == ZIO_PRIORITY_NOW) return; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); if (zio->io_type == ZIO_TYPE_READ) { if (priority != ZIO_PRIORITY_SYNC_READ && priority != ZIO_PRIORITY_ASYNC_READ && priority != ZIO_PRIORITY_SCRUB) priority = ZIO_PRIORITY_ASYNC_READ; } else { ASSERT(zio->io_type == ZIO_TYPE_WRITE); if (priority != ZIO_PRIORITY_SYNC_WRITE && priority != ZIO_PRIORITY_ASYNC_WRITE) priority = ZIO_PRIORITY_ASYNC_WRITE; } mutex_enter(&vq->vq_lock); /* * If the zio is in none of the queues we can simply change * the priority. If the zio is waiting to be submitted we must * remove it from the queue and re-insert it with the new priority. * Otherwise, the zio is currently active and we cannot change its * priority. 
*/ if (zio->io_queue_state == ZIO_QS_QUEUED) { vdev_queue_class_remove(vq, zio); zio->io_priority = priority; vdev_queue_class_add(vq, zio); } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } mutex_exit(&vq->vq_lock); } boolean_t vdev_queue_pool_busy(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t min_bytes = zfs_dirty_data_max * zfs_vdev_async_write_active_min_dirty_percent / 100; return (dp->dp_dirty_total > min_bytes); } /* * As these two methods are only used for load calculations we're not * concerned if we get an incorrect value on 32bit platforms due to lack of * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. */ uint32_t vdev_queue_length(vdev_t *vd) { return (vd->vdev_queue.vq_active); } uint64_t vdev_queue_last_offset(vdev_t *vd) { return (vd->vdev_queue.vq_last_offset); } uint64_t vdev_queue_class_length(vdev_t *vd, zio_priority_t p) { vdev_queue_t *vq = &vd->vdev_queue; if (vdev_queue_class_fifo(p)) return (vq->vq_class[p].vqc_list_numnodes); else return (avl_numnodes(&vq->vq_class[p].vqc_tree)); } ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, "Max vdev I/O aggregation size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, UINT, ZMOD_RW, "Aggregate write I/O over gap"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, UINT, ZMOD_RW, "Maximum number of active I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, UINT, ZMOD_RW, "Async write concurrency max threshold"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, UINT, ZMOD_RW, "Async write concurrency min threshold"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, UINT, ZMOD_RW, "Max active async read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, UINT, ZMOD_RW, "Min active async read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, UINT, ZMOD_RW, "Max active async write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, UINT, ZMOD_RW, "Min active async write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, UINT, ZMOD_RW, "Max active initializing I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, UINT, ZMOD_RW, "Min active initializing I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, UINT, ZMOD_RW, "Max active removal I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, UINT, ZMOD_RW, "Min active removal I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, UINT, ZMOD_RW, "Max active scrub I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, UINT, ZMOD_RW, "Min active scrub I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, UINT, ZMOD_RW, "Max active sync read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, UINT, ZMOD_RW, "Min active sync read I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, UINT, ZMOD_RW, "Max active sync write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, UINT, ZMOD_RW, "Min active sync write I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, 
zfs_vdev_, trim_max_active, UINT, ZMOD_RW, "Max active trim/discard I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, UINT, ZMOD_RW, "Min active trim/discard I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, UINT, ZMOD_RW, "Max active rebuild I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, UINT, ZMOD_RW, "Min active rebuild I/Os per vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, UINT, ZMOD_RW, "Number of non-interactive I/Os to allow in sequence"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW, "Number of non-interactive I/Os before _max_active"); diff --git a/sys/contrib/openzfs/module/zfs/vdev_removal.c b/sys/contrib/openzfs/module/zfs/vdev_removal.c index 2f7a739da241..2ce0121324ad 100644 --- a/sys/contrib/openzfs/module/zfs/vdev_removal.c +++ b/sys/contrib/openzfs/module/zfs/vdev_removal.c @@ -1,2581 +1,2581 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2019, loli10K . All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This file contains the necessary logic to remove vdevs from a * storage pool. Currently, the only devices that can be removed * are log, cache, and spare devices; and top level vdevs from a pool * w/o raidz or mirrors. (Note that members of a mirror can be removed * by the detach operation.) * * Log vdevs are removed by evacuating them and then turning the vdev * into a hole vdev while holding spa config locks. * * Top level vdevs are removed and converted into an indirect vdev via * a multi-step process: * * - Disable allocations from this device (spa_vdev_remove_top). * * - From a new thread (spa_vdev_remove_thread), copy data from * the removing vdev to a different vdev. The copy happens in open * context (spa_vdev_copy_impl) and issues a sync task * (vdev_mapping_sync) so the sync thread can update the partial * indirect mappings in core and on disk. * * - If a free happens during a removal, it is freed from the * removing vdev, and if it has already been copied, from the new * location as well (free_from_removing_vdev). * * - After the removal is completed, the copy thread converts the vdev * into an indirect vdev (vdev_remove_complete) before instructing * the sync thread to destroy the space maps and finish the removal * (spa_finish_removal). 
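/*
 * Editor's illustrative sketch (not part of this change): a drastically
 * simplified model of what an indirect mapping provides once a top-level
 * vdev has been removed -- translate an offset on the removed vdev to the
 * location its data was copied to. The struct and the linear search are
 * hypothetical; the real mapping is the on-disk vdev_indirect_mapping
 * object maintained by the code in this file.
 */
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

typedef struct {
        uint64_t src_offset;    /* offset on the removed vdev */
        uint64_t size;
        uint64_t dst_vdev;      /* destination top-level vdev id */
        uint64_t dst_offset;    /* offset on the destination vdev */
} ex_mapping_t;

/*
 * Entries are assumed sorted by src_offset and non-overlapping; returns
 * false if the offset falls in no entry (e.g. the block was freed before
 * the copy thread reached it).
 */
static bool
ex_remap(const ex_mapping_t *m, size_t n, uint64_t offset,
    uint64_t *dst_vdev, uint64_t *dst_offset)
{
        for (size_t i = 0; i < n; i++) {
                if (offset >= m[i].src_offset &&
                    offset < m[i].src_offset + m[i].size) {
                        *dst_vdev = m[i].dst_vdev;
                        *dst_offset = m[i].dst_offset +
                            (offset - m[i].src_offset);
                        return (true);
                }
        }
        return (false);
}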
*/ typedef struct vdev_copy_arg { metaslab_t *vca_msp; uint64_t vca_outstanding_bytes; uint64_t vca_read_error_bytes; uint64_t vca_write_error_bytes; kcondvar_t vca_cv; kmutex_t vca_lock; } vdev_copy_arg_t; /* * The maximum amount of memory we can use for outstanding i/o while * doing a device removal. This determines how much i/o we can have * in flight concurrently. */ static const uint_t zfs_remove_max_copy_bytes = 64 * 1024 * 1024; /* * The largest contiguous segment that we will attempt to allocate when * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If * there is a performance problem with attempting to allocate large blocks, * consider decreasing this. * * See also the accessor function spa_remove_max_segment(). */ -uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; +static uint_t zfs_remove_max_segment = SPA_MAXBLOCKSIZE; /* * Ignore hard IO errors during device removal. When set if a device * encounters hard IO error during the removal process the removal will * not be cancelled. This can result in a normally recoverable block * becoming permanently damaged and is not recommended. */ static int zfs_removal_ignore_errors = 0; /* * Allow a remap segment to span free chunks of at most this size. The main * impact of a larger span is that we will read and write larger, more * contiguous chunks, with more "unnecessary" data -- trading off bandwidth * for iops. The value here was chosen to align with * zfs_vdev_read_gap_limit, which is a similar concept when doing regular * reads (but there's no reason it has to be the same). * * Additionally, a higher span will have the following relatively minor * effects: * - the mapping will be smaller, since one entry can cover more allocated * segments * - more of the fragmentation in the removing device will be preserved * - we'll do larger allocations, which may fail and fall back on smaller * allocations */ uint_t vdev_removal_max_span = 32 * 1024; /* * This is used by the test suite so that it can ensure that certain * actions happen while in the middle of a removal. 
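/*
 * Editor's illustrative sketch (not part of this change): the
 * vdev_removal_max_span trade-off described above. Two allocated segments
 * on the removing vdev may be carried as one contiguous chunk -- copying
 * some "unnecessary" free bytes in between -- if the gap separating them
 * is small enough; otherwise they stay separate. Names are hypothetical.
 */
#include <stdint.h>
#include <stdbool.h>

static uint64_t ex_max_span = 32 * 1024;        /* like vdev_removal_max_span */

static bool
ex_merge_segments(uint64_t a_offset, uint64_t a_size, uint64_t b_offset)
{
        /* b is assumed to start at or after the end of a. */
        return (b_offset - (a_offset + a_size) <= ex_max_span);
}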
*/ -int zfs_removal_suspend_progress = 0; +static int zfs_removal_suspend_progress = 0; #define VDEV_REMOVAL_ZAP_OBJS "lzap" static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg); static int spa_vdev_remove_cancel_impl(spa_t *spa); static void spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) { VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys, tx)); } static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { for (int i = 0; i < count; i++) { uint64_t guid = fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); if (guid == target_guid) return (nvpp[i]); } return (NULL); } static void vdev_activate(vdev_t *vd) { metaslab_group_t *mg = vd->vdev_mg; ASSERT(!vd->vdev_islog); ASSERT(vd->vdev_noalloc); metaslab_group_activate(mg); metaslab_group_activate(vd->vdev_log_mg); vdev_update_nonallocating_space(vd, B_FALSE); vd->vdev_noalloc = B_FALSE; } static int vdev_passivate(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; int error; ASSERT(!vd->vdev_noalloc); vdev_t *rvd = spa->spa_root_vdev; metaslab_group_t *mg = vd->vdev_mg; metaslab_class_t *normal = spa_normal_class(spa); if (mg->mg_class == normal) { /* * We must check that this is not the only allocating device in * the pool before passivating, otherwise we will not be able * to make progress because we can't allocate from any vdevs. */ boolean_t last = B_TRUE; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; if (cvd == vd || !vdev_is_concrete(cvd) || vdev_is_dead(cvd)) continue; metaslab_class_t *mc = cvd->vdev_mg->mg_class; if (mc != normal) continue; if (!cvd->vdev_noalloc) { last = B_FALSE; break; } } if (last) return (SET_ERROR(EINVAL)); } metaslab_group_passivate(mg); ASSERT(!vd->vdev_islog); metaslab_group_passivate(vd->vdev_log_mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * We must ensure that no "stubby" log blocks are allocated * on the device to be removed. These blocks could be * written at any time, including while we are in the middle * of copying them. */ error = spa_reset_logs(spa); *txg = spa_vdev_config_enter(spa); if (error != 0) { metaslab_group_activate(mg); ASSERT(!vd->vdev_islog); if (vd->vdev_log_mg != NULL) metaslab_group_activate(vd->vdev_log_mg); return (error); } vdev_update_nonallocating_space(vd, B_TRUE); vd->vdev_noalloc = B_TRUE; return (0); } /* * Turn off allocations for a top-level device from the pool. * * Turning off allocations for a top-level device can take a significant * amount of time. As a result we use the spa_vdev_config_[enter/exit] * functions which allow us to grab and release the spa_config_lock while * still holding the namespace lock. During each step the configuration * is synced out. 
*/ int spa_vdev_noalloc(spa_t *spa, uint64_t guid) { vdev_t *vd; uint64_t txg; int error = 0; ASSERT(!MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) error = SET_ERROR(ENOENT); else if (vd->vdev_mg == NULL) error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); else if (!vd->vdev_noalloc) error = vdev_passivate(vd, &txg); if (error == 0) { vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); } error = spa_vdev_exit(spa, NULL, txg, error); return (error); } int spa_vdev_alloc(spa_t *spa, uint64_t guid) { vdev_t *vd; uint64_t txg; int error = 0; ASSERT(!MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) error = SET_ERROR(ENOENT); else if (vd->vdev_mg == NULL) error = SET_ERROR(ZFS_ERR_VDEV_NOTSUP); else if (!vd->vdev_removing) vdev_activate(vd); if (error == 0) { vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); } (void) spa_vdev_exit(spa, NULL, txg, error); return (error); } static void spa_vdev_remove_aux(nvlist_t *config, const char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY0(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP)); } VERIFY0(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY)); fnvlist_add_nvlist_array(config, name, (const nvlist_t * const *)newdev, count - 1); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) kmem_free(newdev, (count - 1) * sizeof (void *)); } static spa_vdev_removal_t * spa_vdev_removal_create(vdev_t *vd) { spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); svr->svr_allocd_segs = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_allocd_segs")); svr->svr_vdev_id = vd->vdev_id; for (int i = 0; i < TXG_SIZE; i++) { svr->svr_frees[i] = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "svr_frees")); list_create(&svr->svr_new_segments[i], sizeof (vdev_indirect_mapping_entry_t), offsetof(vdev_indirect_mapping_entry_t, vime_node)); } return (svr); } void spa_vdev_removal_destroy(spa_vdev_removal_t *svr) { for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); ASSERT0(svr->svr_max_offset_to_sync[i]); zfs_range_tree_destroy(svr->svr_frees[i]); list_destroy(&svr->svr_new_segments[i]); } zfs_range_tree_destroy(svr->svr_allocd_segs); mutex_destroy(&svr->svr_lock); cv_destroy(&svr->svr_cv); kmem_free(svr, sizeof (*svr)); } /* * This is called as a synctask in the txg in which we will mark this vdev * as removing (in the config stored in the MOS). 
* * It begins the evacuation of a toplevel vdev by: * - initializing the spa_removing_phys which tracks this removal * - computing the amount of space to remove for accounting purposes * - dirtying all dbufs in the spa_config_object * - creating the spa_vdev_removal * - starting the spa_vdev_remove_thread */ static void vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) { int vdev_id = (uintptr_t)arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; spa_vdev_removal_t *svr = NULL; uint64_t txg __maybe_unused = dmu_tx_get_txg(tx); ASSERT0(vdev_get_nparity(vd)); svr = spa_vdev_removal_create(vd); ASSERT(vd->vdev_removing); ASSERT0P(vd->vdev_indirect_mapping); spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { /* * By activating the OBSOLETE_COUNTS feature, we prevent * the pool from being downgraded and ensure that the * refcounts are precise. */ spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); uint64_t one = 1; VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, &one, tx)); boolean_t are_precise __maybe_unused; ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise)); ASSERT3B(are_precise, ==, B_TRUE); } vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); vd->vdev_indirect_mapping = vdev_indirect_mapping_open(mos, vic->vic_mapping_object); vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); vd->vdev_indirect_births = vdev_indirect_births_open(mos, vic->vic_births_object); spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; spa->spa_removing_phys.sr_start_time = gethrestime_sec(); spa->spa_removing_phys.sr_end_time = 0; spa->spa_removing_phys.sr_state = DSS_SCANNING; spa->spa_removing_phys.sr_to_copy = 0; spa->spa_removing_phys.sr_copied = 0; /* * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because * there may be space in the defer tree, which is free, but still * counted in vs_alloc. */ for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { metaslab_t *ms = vd->vdev_ms[i]; if (ms->ms_sm == NULL) continue; spa->spa_removing_phys.sr_to_copy += metaslab_allocated_space(ms); /* * Space which we are freeing this txg does not need to * be copied. */ spa->spa_removing_phys.sr_to_copy -= zfs_range_tree_space(ms->ms_freeing); ASSERT0(zfs_range_tree_space(ms->ms_freed)); for (int t = 0; t < TXG_SIZE; t++) ASSERT0(zfs_range_tree_space(ms->ms_allocating[t])); } /* * Sync tasks are called before metaslab_sync(), so there should * be no already-synced metaslabs in the TXG_CLEAN list. */ ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); spa_sync_removing_state(spa, tx); /* * All blocks that we need to read the most recent mapping must be * stored on concrete vdevs. Therefore, we must dirty anything that * is read before spa_remove_init(). Specifically, the * spa_config_object. (Note that although we already modified the * spa_config_object in spa_sync_removing_state, that may not have * modified all blocks of the object.) 
*/ dmu_object_info_t doi; VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { dmu_buf_t *dbuf; VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, offset, FTAG, &dbuf, 0)); dmu_buf_will_dirty(dbuf, tx); offset += dbuf->db_size; dmu_buf_rele(dbuf, FTAG); } /* * Now that we've allocated the im_object, dirty the vdev to ensure * that the object gets written to the config on disk. */ vdev_config_dirty(vd); zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu " "im_obj=%llu", (u_longlong_t)vd->vdev_id, vd, (u_longlong_t)dmu_tx_get_txg(tx), (u_longlong_t)vic->vic_mapping_object); spa_history_log_internal(spa, "vdev remove started", tx, "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); /* * Setting spa_vdev_removal causes subsequent frees to call * free_from_removing_vdev(). Note that we don't need any locking * because we are the sync thread, and metaslab_free_impl() is only * called from syncing context (potentially from a zio taskq thread, * but in any case only when there are outstanding free i/os, which * there are not). */ ASSERT0P(spa->spa_vdev_removal); spa->spa_vdev_removal = svr; svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* * When we are opening a pool, we must read the mapping for each * indirect vdev in order from most recently removed to least * recently removed. We do this because the blocks for the mapping * of older indirect vdevs may be stored on more recently removed vdevs. * In order to read each indirect mapping object, we must have * initialized all more recently removed vdevs. */ int spa_remove_init(spa_t *spa) { int error; error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_REMOVING, sizeof (uint64_t), sizeof (spa->spa_removing_phys) / sizeof (uint64_t), &spa->spa_removing_phys); if (error == ENOENT) { spa->spa_removing_phys.sr_state = DSS_NONE; spa->spa_removing_phys.sr_removing_vdev = -1; spa->spa_removing_phys.sr_prev_indirect_vdev = -1; spa->spa_indirect_vdevs_loaded = B_TRUE; return (0); } else if (error != 0) { return (error); } if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { /* * We are currently removing a vdev. Create and * initialize a spa_vdev_removal_t from the bonus * buffer of the removing vdevs vdev_im_object, and * initialize its partial mapping. 
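/*
 * Illustrative sketch (editor's addition): the loop near the end of
 * spa_remove_init() follows the chain of "previous indirect vdev" links,
 * newest removal first, until it reaches a UINT64_MAX sentinel, so each
 * older mapping can be read through the already-loaded newer ones.  The
 * standalone toy below walks such a chain over a plain array; the names
 * are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define	TOY_NONE	UINT64_MAX

struct toy_vdev {
	uint64_t prev_indirect;	/* id of the next-older indirect vdev */
};

static void
toy_walk_indirect_chain(const struct toy_vdev *vdevs, uint64_t newest)
{
	for (uint64_t id = newest; id != TOY_NONE;
	    id = vdevs[id].prev_indirect) {
		/* Load this vdev's mapping before any older one. */
		printf("load mapping for vdev %llu\n", (unsigned long long)id);
	}
}

int
main(void)
{
	/* vdev 4 was removed most recently, then vdev 2, then vdev 0. */
	struct toy_vdev vdevs[5] = {
		[0] = { TOY_NONE }, [2] = { 0 }, [4] = { 2 },
	};

	toy_walk_indirect_chain(vdevs, 4);	/* prints 4, 2, 0 */
	return (0);
}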
*/ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, spa->spa_removing_phys.sr_removing_vdev); if (vd == NULL) { spa_config_exit(spa, SCL_STATE, FTAG); return (EINVAL); } vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT(vdev_is_concrete(vd)); spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); ASSERT(vd->vdev_removing); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); spa_config_exit(spa, SCL_STATE, FTAG); spa->spa_vdev_removal = svr; } spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; while (indirect_vdev_id != UINT64_MAX) { vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); vd->vdev_indirect_mapping = vdev_indirect_mapping_open( spa->spa_meta_objset, vic->vic_mapping_object); vd->vdev_indirect_births = vdev_indirect_births_open( spa->spa_meta_objset, vic->vic_births_object); indirect_vdev_id = vic->vic_prev_indirect_vdev; } spa_config_exit(spa, SCL_STATE, FTAG); /* * Now that we've loaded all the indirect mappings, we can allow * reads from other blocks (e.g. via predictive prefetch). */ spa->spa_indirect_vdevs_loaded = B_TRUE; return (0); } void spa_restart_removal(spa_t *spa) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; if (svr == NULL) return; /* * In general when this function is called there is no * removal thread running. The only scenario where this * is not true is during spa_import() where this function * is called twice [once from spa_import_impl() and * spa_async_resume()]. Thus, in the scenario where we * import a pool that has an ongoing removal we don't * want to spawn a second thread. */ if (svr->svr_thread != NULL) return; if (!spa_writeable(spa)) return; zfs_dbgmsg("restarting removal of %llu", (u_longlong_t)svr->svr_vdev_id); svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); } /* * Process freeing from a device which is in the middle of being removed. * We must handle this carefully so that we attempt to copy freed data, * and we correctly free already-copied data. */ void free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) { spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t txg = spa_syncing_txg(spa); uint64_t max_offset_yet = 0; ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, vdev_indirect_mapping_object(vim)); ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); mutex_enter(&svr->svr_lock); /* * Remove the segment from the removing vdev's spacemap. This * ensures that we will not attempt to copy this space (if the * removal thread has not yet visited it), and also ensures * that we know what is actually allocated on the new vdevs * (needed if we cancel the removal). * * Note: we must do the metaslab_free_concrete() with the svr_lock * held, so that the remove_thread can not load this metaslab and then * visit this offset between the time that we metaslab_free_concrete() * and when we check to see if it has been visited. 
* * Note: The checkpoint flag is set to false as having/taking * a checkpoint and removing a device can't happen at the same * time. */ ASSERT(!spa_has_checkpoint(spa)); metaslab_free_concrete(vd, offset, size, B_FALSE); uint64_t synced_size = 0; uint64_t synced_offset = 0; uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); if (offset < max_offset_synced) { /* * The mapping for this offset is already on disk. * Free from the new location. * * Note that we use svr_max_synced_offset because it is * updated atomically with respect to the in-core mapping. * By contrast, vim_max_offset is not. * * This block may be split between a synced entry and an * in-flight or unvisited entry. Only process the synced * portion of it here. */ synced_size = MIN(size, max_offset_synced - offset); synced_offset = offset; ASSERT3U(max_offset_yet, <=, max_offset_synced); max_offset_yet = max_offset_synced; DTRACE_PROBE3(remove__free__synced, spa_t *, spa, uint64_t, offset, uint64_t, synced_size); size -= synced_size; offset += synced_size; } /* * Look at all in-flight txgs starting from the currently syncing one * and see if a section of this free is being copied. By starting from * this txg and iterating forward, we might find that this region * was copied in two different txgs and handle it appropriately. */ for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { int txgoff = (txg + i) & TXG_MASK; if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { /* * The mapping for this offset is in flight, and * will be synced in txg+i. */ uint64_t inflight_size = MIN(size, svr->svr_max_offset_to_sync[txgoff] - offset); DTRACE_PROBE4(remove__free__inflight, spa_t *, spa, uint64_t, offset, uint64_t, inflight_size, uint64_t, txg + i); /* * We copy data in order of increasing offset. * Therefore the max_offset_to_sync[] must increase * (or be zero, indicating that nothing is being * copied in that txg). */ if (svr->svr_max_offset_to_sync[txgoff] != 0) { ASSERT3U(svr->svr_max_offset_to_sync[txgoff], >=, max_offset_yet); max_offset_yet = svr->svr_max_offset_to_sync[txgoff]; } /* * We've already committed to copying this segment: * we have allocated space elsewhere in the pool for * it and have an IO outstanding to copy the data. We * cannot free the space before the copy has * completed, or else the copy IO might overwrite any * new data. To free that space, we record the * segment in the appropriate svr_frees tree and free * the mapped space later, in the txg where we have * completed the copy and synced the mapping (see * vdev_mapping_sync). */ zfs_range_tree_add(svr->svr_frees[txgoff], offset, inflight_size); size -= inflight_size; offset += inflight_size; /* * This space is already accounted for as being * done, because it is being copied in txg+i. * However, if i!=0, then it is being copied in * a future txg. If we crash after this txg * syncs but before txg+i syncs, then the space * will be free. Therefore we must account * for the space being done in *this* txg * (when it is freed) rather than the future txg * (when it will be copied). */ ASSERT3U(svr->svr_bytes_done[txgoff], >=, inflight_size); svr->svr_bytes_done[txgoff] -= inflight_size; svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; } } ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); if (size > 0) { /* * The copy thread has not yet visited this offset. Ensure * that it doesn't. 
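/*
 * Illustrative sketch (editor's addition): free_from_removing_vdev() splits
 * a freed extent into a synced part (mapping already on disk), in-flight
 * parts (mapping will sync in txg+i), and an unvisited remainder.  The
 * standalone toy below performs the same three-way split on plain integers;
 * names are hypothetical and TOY_CONCURRENT stands in for
 * TXG_CONCURRENT_STATES.
 */
#include <stdint.h>
#include <stdio.h>

#define	TOY_CONCURRENT	3

static void
toy_split_free(uint64_t offset, uint64_t size, uint64_t max_synced,
    const uint64_t max_to_sync[TOY_CONCURRENT])
{
	if (offset < max_synced) {
		uint64_t n = size < max_synced - offset ?
		    size : max_synced - offset;
		printf("synced:    off %llu len %llu\n",
		    (unsigned long long)offset, (unsigned long long)n);
		offset += n;
		size -= n;
	}
	for (int i = 0; i < TOY_CONCURRENT && size > 0; i++) {
		if (offset >= max_to_sync[i])
			continue;	/* nothing in flight covers us here */
		uint64_t n = size < max_to_sync[i] - offset ?
		    size : max_to_sync[i] - offset;
		printf("in flight: off %llu len %llu (txg+%d)\n",
		    (unsigned long long)offset, (unsigned long long)n, i);
		offset += n;
		size -= n;
	}
	if (size > 0)
		printf("unvisited: off %llu len %llu\n",
		    (unsigned long long)offset, (unsigned long long)size);
}

int
main(void)
{
	uint64_t max_to_sync[TOY_CONCURRENT] = { 150, 200, 0 };

	/* Free [80, 230): 20 synced, 50 + 50 in flight, 30 unvisited. */
	toy_split_free(80, 150, 100, max_to_sync);
	return (0);
}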
*/ DTRACE_PROBE3(remove__free__unvisited, spa_t *, spa, uint64_t, offset, uint64_t, size); if (svr->svr_allocd_segs != NULL) zfs_range_tree_clear(svr->svr_allocd_segs, offset, size); /* * Since we now do not need to copy this data, for * accounting purposes we have done our job and can count * it as completed. */ svr->svr_bytes_done[txg & TXG_MASK] += size; } mutex_exit(&svr->svr_lock); /* * Now that we have dropped svr_lock, process the synced portion * of this free. */ if (synced_size > 0) { vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); /* * Note: this can only be called from syncing context, * and the vdev_indirect_mapping is only changed from the * sync thread, so we don't need svr_lock while doing * metaslab_free_impl_cb. */ boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, metaslab_free_impl_cb, &checkpoint); } } /* * Stop an active removal and update the spa_removing phys. */ static void spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); /* Ensure the removal thread has completed before we free the svr. */ spa_vdev_remove_suspend(spa); ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); if (state == DSS_FINISHED) { spa_removing_phys_t *srp = &spa->spa_removing_phys; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; if (srp->sr_prev_indirect_vdev != -1) { vdev_t *pvd; pvd = vdev_lookup_top(spa, srp->sr_prev_indirect_vdev); ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); } vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; srp->sr_prev_indirect_vdev = vd->vdev_id; } spa->spa_removing_phys.sr_state = state; spa->spa_removing_phys.sr_end_time = gethrestime_sec(); spa->spa_vdev_removal = NULL; spa_vdev_removal_destroy(svr); spa_sync_removing_state(spa, tx); spa_notify_waiters(spa); vdev_config_dirty(spa->spa_root_vdev); } static void free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) { vdev_t *vd = arg; vdev_indirect_mark_obsolete(vd, offset, size); boolean_t checkpoint = B_FALSE; vdev_indirect_ops.vdev_op_remap(vd, offset, size, metaslab_free_impl_cb, &checkpoint); } /* * On behalf of the removal thread, syncs an incremental bit more of * the indirect mapping to disk and updates the in-memory mapping. * Called as a sync task in every txg that the removal thread makes progress. */ static void vdev_mapping_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; uint64_t txg = dmu_tx_get_txg(tx); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT(vic->vic_mapping_object != 0); ASSERT3U(txg, ==, spa_syncing_txg(spa)); vdev_indirect_mapping_add_entries(vim, &svr->svr_new_segments[txg & TXG_MASK], tx); vdev_indirect_births_add_entry(vd->vdev_indirect_births, vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); /* * Free the copied data for anything that was freed while the * mapping entries were in flight. 
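/*
 * Illustrative sketch (editor's addition): per-txg state in this file, such
 * as svr_frees[], svr_bytes_done[] and svr_max_offset_to_sync[], lives in
 * small arrays indexed by (txg & TXG_MASK).  The toy below shows how
 * consecutive txgs reuse the same slots once older txgs have synced;
 * TOY_TXG_SIZE stands in for TXG_SIZE (4 in ZFS).
 */
#include <stdint.h>
#include <stdio.h>

#define	TOY_TXG_SIZE	4
#define	TOY_TXG_MASK	(TOY_TXG_SIZE - 1)

int
main(void)
{
	uint64_t bytes_done[TOY_TXG_SIZE] = { 0 };

	for (uint64_t txg = 100; txg < 106; txg++) {
		int slot = txg & TOY_TXG_MASK;

		/* A slot is reused every TOY_TXG_SIZE txgs. */
		bytes_done[slot] += 1;
		printf("txg %llu uses slot %d\n",
		    (unsigned long long)txg, slot);
	}
	return (0);
}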
*/ mutex_enter(&svr->svr_lock); zfs_range_tree_vacate(svr->svr_frees[txg & TXG_MASK], free_mapped_segment_cb, vd); ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, vdev_indirect_mapping_max_offset(vim)); svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; mutex_exit(&svr->svr_lock); spa_sync_removing_state(spa, tx); } typedef struct vdev_copy_segment_arg { spa_t *vcsa_spa; dva_t *vcsa_dest_dva; uint64_t vcsa_txg; zfs_range_tree_t *vcsa_obsolete_segs; } vdev_copy_segment_arg_t; static void unalloc_seg(void *arg, uint64_t start, uint64_t size) { vdev_copy_segment_arg_t *vcsa = arg; spa_t *spa = vcsa->vcsa_spa; blkptr_t bp = { { { {0} } } }; BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); BP_SET_LSIZE(&bp, size); BP_SET_PSIZE(&bp, size); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); BP_SET_TYPE(&bp, DMU_OT_NONE); BP_SET_LEVEL(&bp, 0); BP_SET_DEDUP(&bp, 0); BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); DVA_SET_OFFSET(&bp.blk_dva[0], DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); DVA_SET_ASIZE(&bp.blk_dva[0], size); zio_free(spa, vcsa->vcsa_txg, &bp); } /* * All reads and writes associated with a call to spa_vdev_copy_segment() * are done. */ static void spa_vdev_copy_segment_done(zio_t *zio) { vdev_copy_segment_arg_t *vcsa = zio->io_private; zfs_range_tree_vacate(vcsa->vcsa_obsolete_segs, unalloc_seg, vcsa); zfs_range_tree_destroy(vcsa->vcsa_obsolete_segs); kmem_free(vcsa, sizeof (*vcsa)); spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); } /* * The write of the new location is done. */ static void spa_vdev_copy_segment_write_done(zio_t *zio) { vdev_copy_arg_t *vca = zio->io_private; abd_free(zio->io_abd); mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes -= zio->io_size; if (zio->io_error != 0) vca->vca_write_error_bytes += zio->io_size; cv_signal(&vca->vca_cv); mutex_exit(&vca->vca_lock); } /* * The read of the old location is done. The parent zio is the write to * the new location. Allow it to start. */ static void spa_vdev_copy_segment_read_done(zio_t *zio) { vdev_copy_arg_t *vca = zio->io_private; if (zio->io_error != 0) { mutex_enter(&vca->vca_lock); vca->vca_read_error_bytes += zio->io_size; mutex_exit(&vca->vca_lock); } zio_nowait(zio_unique_parent(zio)); } /* * If the old and new vdevs are mirrors, we will read both sides of the old * mirror, and write each copy to the corresponding side of the new mirror. * If the old and new vdevs have a different number of children, we will do * this as best as possible. Since we aren't verifying checksums, this * ensures that as long as there's a good copy of the data, we'll have a * good copy after the removal, even if there's silent damage to one side * of the mirror. If we're removing a mirror that has some silent damage, * we'll have exactly the same damage in the new location (assuming that * the new location is also a mirror). * * We accomplish this by creating a tree of zio_t's, with as many writes as * there are "children" of the new vdev (a non-redundant vdev counts as one * child, a 2-way mirror has 2 children, etc). Each write has an associated * read from a child of the old vdev. Typically there will be the same * number of children of the old and new vdevs. However, if there are more * children of the new vdev, some child(ren) of the old vdev will be issued * multiple reads. If there are more children of the old vdev, some copies * will be dropped. 
* * For example, the tree of zio_t's for a 2-way mirror is: * * null * / \ * write(new vdev, child 0) write(new vdev, child 1) * | | * read(old vdev, child 0) read(old vdev, child 1) * * Child zio's complete before their parents complete. However, zio's * created with zio_vdev_child_io() may be issued before their children * complete. In this case we need to make sure that the children (reads) * complete before the parents (writes) are *issued*. We do this by not * calling zio_nowait() on each write until its corresponding read has * completed. * * The spa_config_lock must be held while zio's created by * zio_vdev_child_io() are in progress, to ensure that the vdev tree does * not change (e.g. due to a concurrent "zpool attach/detach"). The "null" * zio is needed to release the spa_config_lock after all the reads and * writes complete. (Note that we can't grab the config lock for each read, * because it is not reentrant - we could deadlock with a thread waiting * for a write lock.) */ static void spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio, vdev_t *source_vd, uint64_t source_offset, vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size) { ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0); /* * If the destination child in unwritable then there is no point * in issuing the source reads which cannot be written. */ if (!vdev_writeable(dest_child_vd)) return; mutex_enter(&vca->vca_lock); vca->vca_outstanding_bytes += size; mutex_exit(&vca->vca_lock); abd_t *abd = abd_alloc_for_io(size, B_FALSE); vdev_t *source_child_vd = NULL; if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) { /* * Source and dest are both mirrors. Copy from the same * child id as we are copying to (wrapping around if there * are more dest children than source children). If the * preferred source child is unreadable select another. */ for (int i = 0; i < source_vd->vdev_children; i++) { source_child_vd = source_vd->vdev_child[ (dest_id + i) % source_vd->vdev_children]; if (vdev_readable(source_child_vd)) break; } } else { source_child_vd = source_vd; } /* * There should always be at least one readable source child or * the pool would be in a suspended state. Somehow selecting an * unreadable child would result in IO errors, the removal process * being cancelled, and the pool reverting to its pre-removal state. */ ASSERT3P(source_child_vd, !=, NULL); zio_t *write_zio = zio_vdev_child_io(nzio, NULL, dest_child_vd, dest_offset, abd, size, ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, spa_vdev_copy_segment_write_done, vca); zio_nowait(zio_vdev_child_io(write_zio, NULL, source_child_vd, source_offset, abd, size, ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, ZIO_FLAG_CANFAIL, spa_vdev_copy_segment_read_done, vca)); } /* * Allocate a new location for this segment, and create the zio_t's to * read from the old location and write to the new location. 
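/*
 * Illustrative sketch (editor's addition): when both the removing vdev and
 * the destination are mirrors, spa_vdev_copy_one_child() reads from the
 * source child with the same index as the destination child, wrapping
 * around and skipping unreadable children.  The toy below reproduces that
 * selection over a boolean "readable" array; names are hypothetical.
 */
#include <stdio.h>

/* Pick a source child for destination child dest_id, or -1 if none. */
static int
toy_pick_source_child(const int *readable, int source_children, int dest_id)
{
	for (int i = 0; i < source_children; i++) {
		int cand = (dest_id + i) % source_children;

		if (readable[cand])
			return (cand);
	}
	return (-1);
}

int
main(void)
{
	int readable[3] = { 1, 0, 1 };	/* source child 1 is unreadable */

	/* Destination child 1 falls back to source child 2. */
	printf("%d\n", toy_pick_source_child(readable, 3, 1));
	/* Destination child 4 of a wider dest mirror wraps to 1, then 2. */
	printf("%d\n", toy_pick_source_child(readable, 3, 4));
	return (0);
}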
*/ static int spa_vdev_copy_segment(vdev_t *vd, zfs_range_tree_t *segs, uint64_t maxalloc, uint64_t txg, vdev_copy_arg_t *vca, zio_alloc_list_t *zal) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_indirect_mapping_entry_t *entry; dva_t dst = {{ 0 }}; uint64_t start = zfs_range_tree_min(segs); ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); uint64_t size = zfs_range_tree_span(segs); if (zfs_range_tree_span(segs) > maxalloc) { /* * We can't allocate all the segments. Prefer to end * the allocation at the end of a segment, thus avoiding * additional split blocks. */ zfs_range_seg_max_t search; zfs_btree_index_t where; zfs_rs_set_start(&search, segs, start + maxalloc); zfs_rs_set_end(&search, segs, start + maxalloc); (void) zfs_btree_find(&segs->rt_root, &search, &where); zfs_range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, &where); if (rs != NULL) { size = zfs_rs_get_end(rs, segs) - start; } else { /* * There are no segments that end before maxalloc. * I.e. the first segment is larger than maxalloc, * so we must split it. */ size = maxalloc; } } ASSERT3U(size, <=, maxalloc); ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift)); /* * An allocation class might not have any remaining vdevs or space */ metaslab_class_t *mc = mg->mg_class; if (mc->mc_groups == 0) mc = spa_normal_class(spa); int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, zal, 0); if (error == ENOSPC && mc != spa_normal_class(spa)) { error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, &dst, 0, NULL, txg, 0, zal, 0); } if (error != 0) return (error); /* * Determine the ranges that are not actually needed. Offsets are * relative to the start of the range to be copied (i.e. relative to the * local variable "start"). */ zfs_range_tree_t *obsolete_segs = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "obsolete_segs")); zfs_btree_index_t where; zfs_range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); ASSERT3U(zfs_rs_get_start(rs, segs), ==, start); uint64_t prev_seg_end = zfs_rs_get_end(rs, segs); while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { if (zfs_rs_get_start(rs, segs) >= start + size) { break; } else { zfs_range_tree_add(obsolete_segs, prev_seg_end - start, zfs_rs_get_start(rs, segs) - prev_seg_end); } prev_seg_end = zfs_rs_get_end(rs, segs); } /* We don't end in the middle of an obsolete range */ ASSERT3U(start + size, <=, prev_seg_end); zfs_range_tree_clear(segs, start, size); /* * We can't have any padding of the allocated size, otherwise we will * misunderstand what's allocated, and the size of the mapping. We * prevent padding by ensuring that all devices in the pool have the * same ashift, and the allocation size is a multiple of the ashift. */ VERIFY3U(DVA_GET_ASIZE(&dst), ==, size); entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); entry->vime_mapping.vimep_dst = dst; if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { entry->vime_obsolete_count = zfs_range_tree_space(obsolete_segs); } vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; vcsa->vcsa_obsolete_segs = obsolete_segs; vcsa->vcsa_spa = spa; vcsa->vcsa_txg = txg; /* * See comment before spa_vdev_copy_one_child(). 
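/*
 * Illustrative sketch (editor's addition): when the allocated segments span
 * more than maxalloc, spa_vdev_copy_segment() prefers to stop the copy at
 * the end of the last whole segment that fits, splitting a segment only
 * when even the first one is larger than maxalloc.  The toy below makes the
 * same choice over a sorted array of [start, end) segments; names are
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct toy_seg {
	uint64_t start, end;
};

static uint64_t
toy_copy_size(const struct toy_seg *segs, int nsegs, uint64_t maxalloc)
{
	uint64_t start = segs[0].start;
	uint64_t best = 0;

	for (int i = 0; i < nsegs; i++) {
		if (segs[i].end - start <= maxalloc)
			best = segs[i].end - start;	/* ends on a boundary */
	}
	/* No segment end fits: split the first segment at maxalloc. */
	return (best != 0 ? best : maxalloc);
}

int
main(void)
{
	struct toy_seg segs[] = { { 0, 40 }, { 60, 90 }, { 120, 200 } };

	/* Prints 90: stop at the end of the second segment. */
	printf("%llu\n", (unsigned long long)toy_copy_size(segs, 3, 100));
	/* Prints 30: even the first segment must be split. */
	printf("%llu\n", (unsigned long long)toy_copy_size(segs, 3, 30));
	return (0);
}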
*/ spa_config_enter(spa, SCL_STATE, spa, RW_READER); zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, spa_vdev_copy_segment_done, vcsa, 0); vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); if (dest_vd->vdev_ops == &vdev_mirror_ops) { for (int i = 0; i < dest_vd->vdev_children; i++) { vdev_t *child = dest_vd->vdev_child[i]; spa_vdev_copy_one_child(vca, nzio, vd, start, child, DVA_GET_OFFSET(&dst), i, size); } } else { spa_vdev_copy_one_child(vca, nzio, vd, start, dest_vd, DVA_GET_OFFSET(&dst), -1, size); } zio_nowait(nzio); list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); vdev_dirty(vd, 0, NULL, txg); return (0); } /* * Complete the removal of a toplevel vdev. This is called as a * synctask in the same txg that we will sync out the new config (to the * MOS object) which indicates that this vdev is indirect. */ static void vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) { spa_vdev_removal_t *svr = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(svr->svr_bytes_done[i]); } ASSERT3U(spa->spa_removing_phys.sr_copied, ==, spa->spa_removing_phys.sr_to_copy); vdev_destroy_spacemaps(vd, tx); /* destroy leaf zaps, if any */ ASSERT3P(svr->svr_zaplist, !=, NULL); for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); pair != NULL; pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); } fnvlist_free(svr->svr_zaplist); spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); /* vd->vdev_path is not available here */ spa_history_log_internal(spa, "vdev remove completed", tx, "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id); } static void vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) { ASSERT3P(zlist, !=, NULL); ASSERT0(vdev_get_nparity(vd)); if (vd->vdev_leaf_zap != 0) { char zkey[32]; (void) snprintf(zkey, sizeof (zkey), "%s-%llu", VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap); fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); } for (uint64_t id = 0; id < vd->vdev_children; id++) { vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); } } static void vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) { vdev_t *ivd; dmu_tx_t *tx; spa_t *spa = vd->vdev_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; /* * First, build a list of leaf zaps to be destroyed. * This is passed to the sync context thread, * which does the actual unlinking. */ svr->svr_zaplist = fnvlist_alloc(); vdev_remove_enlist_zaps(vd, svr->svr_zaplist); ivd = vdev_add_parent(vd, &vdev_indirect_ops); ivd->vdev_removing = 0; vd->vdev_leaf_zap = 0; vdev_remove_child(ivd, vd); vdev_compact_children(ivd); ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); mutex_enter(&svr->svr_lock); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); /* After this, we can not use svr. */ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, tx); dmu_tx_commit(tx); } /* * Complete the removal of a toplevel vdev. This is called in open * context by the removal thread after we have copied all vdev's data. 
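/*
 * Illustrative sketch (editor's addition): vdev_remove_enlist_zaps() above
 * recursively collects every leaf ZAP object in the removing vdev's subtree
 * so the final sync task can unlink them.  The toy below shows the same
 * recursive collection over a tiny tree with plain integers; names are
 * hypothetical.
 */
#include <stddef.h>
#include <stdio.h>

struct toy_vdev {
	unsigned long leaf_zap;		/* 0 means no leaf ZAP */
	int nchildren;
	struct toy_vdev *child[2];
};

static void
toy_enlist_zaps(const struct toy_vdev *vd, unsigned long *list, int *n)
{
	if (vd->leaf_zap != 0)
		list[(*n)++] = vd->leaf_zap;
	for (int i = 0; i < vd->nchildren; i++)
		toy_enlist_zaps(vd->child[i], list, n);
}

int
main(void)
{
	struct toy_vdev leaf0 = { 101, 0, { NULL, NULL } };
	struct toy_vdev leaf1 = { 102, 0, { NULL, NULL } };
	struct toy_vdev top = { 0, 2, { &leaf0, &leaf1 } };
	unsigned long zaps[4];
	int n = 0;

	toy_enlist_zaps(&top, zaps, &n);
	for (int i = 0; i < n; i++)
		printf("leaf zap %lu\n", zaps[i]);	/* 101, 102 */
	return (0);
}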
*/ static void vdev_remove_complete(spa_t *spa) { uint64_t txg; /* * Wait for any deferred frees to be synced before we call * vdev_metaslab_fini() */ txg_wait_synced(spa->spa_dsl_pool, 0); txg = spa_vdev_enter(spa); vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); ASSERT0P(vd->vdev_initialize_thread); ASSERT0P(vd->vdev_trim_thread); ASSERT0P(vd->vdev_autotrim_thread); vdev_rebuild_stop_wait(vd); ASSERT0P(vd->vdev_rebuild_thread); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)txg); /* the vdev is no longer part of the dspace */ vdev_update_nonallocating_space(vd, B_FALSE); /* * Discard allocation state. */ if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); metaslab_group_destroy(vd->vdev_mg); vd->vdev_mg = NULL; } if (vd->vdev_log_mg != NULL) { ASSERT0(vd->vdev_ms_count); metaslab_group_destroy(vd->vdev_log_mg); vd->vdev_log_mg = NULL; } ASSERT0(vd->vdev_stat.vs_space); ASSERT0(vd->vdev_stat.vs_dspace); vdev_remove_replace_with_indirect(vd, txg); /* * We now release the locks, allowing spa_sync to run and finish the * removal via vdev_remove_complete_sync in syncing context. * * Note that we hold on to the vdev_t that has been replaced. Since * it isn't part of the vdev tree any longer, it can't be concurrently * manipulated, even while we don't have the config lock. */ (void) spa_vdev_exit(spa, NULL, txg, 0); /* * Top ZAP should have been transferred to the indirect vdev in * vdev_remove_replace_with_indirect. */ ASSERT0(vd->vdev_top_zap); /* * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. */ ASSERT0(vd->vdev_leaf_zap); txg = spa_vdev_enter(spa); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Request to update the config and the config cachefile. */ vdev_config_dirty(spa->spa_root_vdev); (void) spa_vdev_exit(spa, vd, txg, 0); if (ev != NULL) spa_event_post(ev); } /* * Evacuates a segment of size at most max_alloc from the vdev * via repeated calls to spa_vdev_copy_segment. If an allocation * fails, the pool is probably too fragmented to handle such a * large size, so decrease max_alloc so that the caller will not try * this size again this txg. */ static void spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, uint64_t *max_alloc, dmu_tx_t *tx) { uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = dmu_tx_pool(tx)->dp_spa; mutex_enter(&svr->svr_lock); /* * Determine how big of a chunk to copy. We can allocate up * to max_alloc bytes, and we can span up to vdev_removal_max_span * bytes of unallocated space at a time. "segs" will track the * allocated segments that we are copying. We may also be copying * free segments (of up to vdev_removal_max_span bytes). */ zfs_range_tree_t *segs = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_copy_impl:segs")); for (;;) { zfs_range_tree_t *rt = svr->svr_allocd_segs; zfs_range_seg_t *rs = zfs_range_tree_first(rt); if (rs == NULL) break; uint64_t seg_length; if (zfs_range_tree_is_empty(segs)) { /* need to truncate the first seg based on max_alloc */ seg_length = MIN(zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt), *max_alloc); } else { if (zfs_rs_get_start(rs, rt) - zfs_range_tree_max(segs) > vdev_removal_max_span) { /* * Including this segment would cause us to * copy a larger unneeded chunk than is allowed. 
*/ break; } else if (zfs_rs_get_end(rs, rt) - zfs_range_tree_min(segs) > *max_alloc) { /* * This additional segment would extend past * max_alloc. Rather than splitting this * segment, leave it for the next mapping. */ break; } else { seg_length = zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt); } } zfs_range_tree_add(segs, zfs_rs_get_start(rs, rt), seg_length); zfs_range_tree_remove(svr->svr_allocd_segs, zfs_rs_get_start(rs, rt), seg_length); } if (zfs_range_tree_is_empty(segs)) { mutex_exit(&svr->svr_lock); zfs_range_tree_destroy(segs); return; } if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, svr, tx); } svr->svr_max_offset_to_sync[txg & TXG_MASK] = zfs_range_tree_max(segs); /* * Note: this is the amount of *allocated* space * that we are taking care of each txg. */ svr->svr_bytes_done[txg & TXG_MASK] += zfs_range_tree_space(segs); mutex_exit(&svr->svr_lock); zio_alloc_list_t zal; metaslab_trace_init(&zal); uint64_t thismax = SPA_MAXBLOCKSIZE; while (!zfs_range_tree_is_empty(segs)) { int error = spa_vdev_copy_segment(vd, segs, thismax, txg, vca, &zal); if (error == ENOSPC) { /* * Cut our segment in half, and don't try this * segment size again this txg. Note that the * allocation size must be aligned to the highest * ashift in the pool, so that the allocation will * not be padded out to a multiple of the ashift, * which could cause us to think that this mapping * is larger than we intended. */ ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); uint64_t attempted = MIN(zfs_range_tree_span(segs), thismax); thismax = P2ROUNDUP(attempted / 2, 1 << spa->spa_max_ashift); /* * The minimum-size allocation can not fail. */ ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); *max_alloc = attempted - (1 << spa->spa_max_ashift); } else { ASSERT0(error); /* * We've performed an allocation, so reset the * alloc trace list. */ metaslab_trace_fini(&zal); metaslab_trace_init(&zal); } } metaslab_trace_fini(&zal); zfs_range_tree_destroy(segs); } /* * The size of each removal mapping is limited by the tunable * zfs_remove_max_segment, but we must adjust this to be a multiple of the * pool's ashift, so that we don't try to split individual sectors regardless * of the tunable value. (Note that device removal requires that all devices * have the same ashift, so there's no difference between spa_min_ashift and * spa_max_ashift.) The raw tunable should not be used elsewhere. */ uint64_t spa_remove_max_segment(spa_t *spa) { return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift)); } /* * The removal thread operates in open context. It iterates over all * allocated space in the vdev, by loading each metaslab's spacemap. * For each contiguous segment of allocated space (capping the segment * size at SPA_MAXBLOCKSIZE), we: * - Allocate space for it on another vdev. * - Create a new mapping from the old location to the new location * (as a record in svr_new_segments). * - Initiate a physical read zio to get the data off the removing disk. * - In the read zio's done callback, initiate a physical write zio to * write it to the new vdev. * Note that all of this will take effect when a particular TXG syncs. * The sync thread ensures that all the phys reads and writes for the syncing * TXG have completed (see spa_txg_zio) and writes the new mappings to disk * (see vdev_mapping_sync()). 
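/*
 * Illustrative sketch (editor's addition): spa_remove_max_segment() above
 * rounds the zfs_remove_max_segment tunable up to a multiple of the pool's
 * ashift, and the ENOSPC path in spa_vdev_copy_impl() keeps its halved
 * allocation attempts ashift-aligned the same way.  The toy below shows the
 * round-up arithmetic; toy_p2roundup mirrors P2ROUNDUP for a power-of-two
 * alignment.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
toy_p2roundup(uint64_t x, uint64_t align)
{
	/* align must be a power of two, e.g. 1 << ashift */
	return ((x + align - 1) & ~(align - 1));
}

int
main(void)
{
	uint64_t ashift = 12;				/* 4 KiB sectors */
	uint64_t tunable = 16 * 1024 * 1024 + 1;	/* misaligned value */

	/* Rounds up to the next 4 KiB boundary: prints 16781312. */
	printf("%llu\n",
	    (unsigned long long)toy_p2roundup(tunable, 1ULL << ashift));

	/* Halving while staying aligned, as done after ENOSPC: 524288. */
	uint64_t attempted = 1024 * 1024;
	uint64_t thismax = toy_p2roundup(attempted / 2, 1ULL << ashift);
	printf("%llu\n", (unsigned long long)thismax);
	return (0);
}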
*/ static __attribute__((noreturn)) void spa_vdev_remove_thread(void *arg) { spa_t *spa = arg; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_copy_arg_t vca; uint64_t max_alloc = spa_remove_max_segment(spa); uint64_t last_txg = 0; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); ASSERT(vdev_is_concrete(vd)); ASSERT(vd->vdev_removing); ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); ASSERT(vim != NULL); mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); vca.vca_outstanding_bytes = 0; vca.vca_read_error_bytes = 0; vca.vca_write_error_bytes = 0; zfs_range_tree_t *segs = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_thread:segs")); mutex_enter(&svr->svr_lock); /* * Start from vim_max_offset so we pick up where we left off * if we are restarting the removal after opening the pool. */ uint64_t msi; for (msi = start_offset >> vd->vdev_ms_shift; msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; ASSERT3U(msi, <=, vd->vdev_ms_count); again: ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); mutex_exit(&svr->svr_lock); mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. */ for (int i = 0; i < TXG_SIZE; i++) { ASSERT0(zfs_range_tree_space(msp->ms_allocating[i])); } /* * If the metaslab has ever been synced (ms_sm != NULL), * read the allocated segments from the space map object * into svr_allocd_segs. Since we do this while holding * ms_lock and ms_sync_lock, concurrent frees (which * would have modified the space map) will wait for us * to finish loading the spacemap, and then take the * appropriate action (see free_from_removing_vdev()). */ if (msp->ms_sm != NULL) VERIFY0(space_map_load(msp->ms_sm, segs, SM_ALLOC)); /* * We could not hold svr_lock while loading space map, or we * could hit deadlock in a ZIO pipeline, having to wait for * it. But we can not block for it here under metaslab locks, * or it would be a lock ordering violation. */ if (!mutex_tryenter(&svr->svr_lock)) { mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); zfs_range_tree_vacate(segs, NULL, NULL); mutex_enter(&svr->svr_lock); goto again; } zfs_range_tree_swap(&segs, &svr->svr_allocd_segs); zfs_range_tree_walk(msp->ms_unflushed_allocs, zfs_range_tree_add, svr->svr_allocd_segs); zfs_range_tree_walk(msp->ms_unflushed_frees, zfs_range_tree_remove, svr->svr_allocd_segs); zfs_range_tree_walk(msp->ms_freeing, zfs_range_tree_remove, svr->svr_allocd_segs); mutex_exit(&msp->ms_lock); mutex_exit(&msp->ms_sync_lock); /* * When we are resuming from a paused removal (i.e. * when importing a pool with a removal in progress), * discard any state that we have already processed. */ zfs_range_tree_clear(svr->svr_allocd_segs, 0, start_offset); vca.vca_msp = msp; zfs_dbgmsg("copying %llu segments for metaslab %llu", (u_longlong_t)zfs_btree_numnodes( &svr->svr_allocd_segs->rt_root), (u_longlong_t)msp->ms_id); while (!svr->svr_thread_exit && !zfs_range_tree_is_empty(svr->svr_allocd_segs)) { mutex_exit(&svr->svr_lock); /* * We need to periodically drop the config lock so that * writers can get in. 
Additionally, we can't wait * for a txg to sync while holding a config lock * (since a waiting writer could cause a 3-way deadlock * with the sync thread, which also gets a config * lock for reader). So we can't hold the config lock * while calling dmu_tx_assign(). */ spa_config_exit(spa, SCL_CONFIG, FTAG); /* * This delay will pause the removal around the point * specified by zfs_removal_suspend_progress. We do this * solely from the test suite or during debugging. */ while (zfs_removal_suspend_progress && !svr->svr_thread_exit) delay(hz); mutex_enter(&vca.vca_lock); while (vca.vca_outstanding_bytes > zfs_remove_max_copy_bytes) { cv_wait(&vca.vca_cv, &vca.vca_lock); } mutex_exit(&vca.vca_lock); dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); uint64_t txg = dmu_tx_get_txg(tx); /* * Reacquire the vdev_config lock. The vdev_t * that we're removing may have changed, e.g. due * to a vdev_attach or vdev_detach. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); vd = vdev_lookup_top(spa, svr->svr_vdev_id); if (txg != last_txg) max_alloc = spa_remove_max_segment(spa); last_txg = txg; spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); dmu_tx_commit(tx); mutex_enter(&svr->svr_lock); } mutex_enter(&vca.vca_lock); if (zfs_removal_ignore_errors == 0 && (vca.vca_read_error_bytes > 0 || vca.vca_write_error_bytes > 0)) { svr->svr_thread_exit = B_TRUE; } mutex_exit(&vca.vca_lock); } mutex_exit(&svr->svr_lock); spa_config_exit(spa, SCL_CONFIG, FTAG); zfs_range_tree_destroy(segs); /* * Wait for all copies to finish before cleaning up the vca. */ txg_wait_synced(spa->spa_dsl_pool, 0); ASSERT0(vca.vca_outstanding_bytes); mutex_destroy(&vca.vca_lock); cv_destroy(&vca.vca_cv); if (svr->svr_thread_exit) { mutex_enter(&svr->svr_lock); zfs_range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); svr->svr_thread = NULL; cv_broadcast(&svr->svr_cv); mutex_exit(&svr->svr_lock); /* * During the removal process an unrecoverable read or write * error was encountered. The removal process must be * cancelled or this damage may become permanent. 
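/*
 * Illustrative sketch (editor's addition): the copy loop above throttles
 * itself by sleeping on vca_cv while vca_outstanding_bytes exceeds
 * zfs_remove_max_copy_bytes, and the write-done callback signals the cv as
 * bytes complete.  The standalone pthread toy below shows the same
 * bounded-outstanding-bytes pattern; all names and limits are made up.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t toy_cv = PTHREAD_COND_INITIALIZER;
static unsigned long toy_outstanding;	/* bytes issued but not completed */
#define	TOY_LIMIT	4096UL

static void *
toy_completer(void *arg)
{
	(void) arg;
	for (int i = 0; i < 8; i++) {
		pthread_mutex_lock(&toy_lock);
		while (toy_outstanding == 0)	/* nothing issued yet */
			pthread_cond_wait(&toy_cv, &toy_lock);
		toy_outstanding -= 1024;	/* one copy write finished */
		pthread_cond_signal(&toy_cv);	/* wake a throttled issuer */
		pthread_mutex_unlock(&toy_lock);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, toy_completer, NULL);
	for (int i = 0; i < 8; i++) {
		pthread_mutex_lock(&toy_lock);
		while (toy_outstanding > TOY_LIMIT)	/* too much in flight */
			pthread_cond_wait(&toy_cv, &toy_lock);
		toy_outstanding += 1024;	/* issue one more copy */
		pthread_cond_signal(&toy_cv);	/* wake the completer */
		pthread_mutex_unlock(&toy_lock);
	}
	pthread_join(tid, NULL);
	printf("all chunks issued and completed\n");
	return (0);
}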
*/ if (zfs_removal_ignore_errors == 0 && (vca.vca_read_error_bytes > 0 || vca.vca_write_error_bytes > 0)) { zfs_dbgmsg("canceling removal due to IO errors: " "[read_error_bytes=%llu] [write_error_bytes=%llu]", (u_longlong_t)vca.vca_read_error_bytes, (u_longlong_t)vca.vca_write_error_bytes); spa_vdev_remove_cancel_impl(spa); } } else { ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); vdev_remove_complete(spa); } thread_exit(); } void spa_vdev_remove_suspend(spa_t *spa) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; if (svr == NULL) return; mutex_enter(&svr->svr_lock); svr->svr_thread_exit = B_TRUE; while (svr->svr_thread != NULL) cv_wait(&svr->svr_cv, &svr->svr_lock); svr->svr_thread_exit = B_FALSE; mutex_exit(&svr->svr_lock); } /* * Return true if the "allocating" property has been set to "off" */ static boolean_t vdev_prop_allocating_off(vdev_t *vd) { uint64_t objid = vd->vdev_top_zap; uint64_t allocating = 1; /* no vdev property object => no props */ if (objid != 0) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; mutex_enter(&spa->spa_props_lock); (void) zap_lookup(mos, objid, "allocating", sizeof (uint64_t), 1, &allocating); mutex_exit(&spa->spa_props_lock); } return (allocating == 0); } static int spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) { (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); return (0); } /* * Cancel a removal by freeing all entries from the partial mapping * and marking the vdev as no longer being removing. */ static void spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) { (void) arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_vdev_removal_t *svr = spa->spa_vdev_removal; vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; objset_t *mos = spa->spa_meta_objset; ASSERT0P(svr->svr_thread); spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); boolean_t are_precise; VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); if (are_precise) { spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); } uint64_t obsolete_sm_object; VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); if (obsolete_sm_object != 0) { ASSERT(vd->vdev_obsolete_sm != NULL); ASSERT3U(obsolete_sm_object, ==, space_map_object(vd->vdev_obsolete_sm)); space_map_free(vd->vdev_obsolete_sm, tx); VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); space_map_close(vd->vdev_obsolete_sm); vd->vdev_obsolete_sm = NULL; spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); } for (int i = 0; i < TXG_SIZE; i++) { ASSERT(list_is_empty(&svr->svr_new_segments[i])); ASSERT3U(svr->svr_max_offset_to_sync[i], <=, vdev_indirect_mapping_max_offset(vim)); } zfs_range_tree_t *segs = zfs_range_tree_create_flags( NULL, ZFS_RANGE_SEG64, NULL, 0, 0, ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "spa_vdev_remove_cancel_sync:segs")); for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { metaslab_t *msp = vd->vdev_ms[msi]; if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) break; ASSERT0(zfs_range_tree_space(svr->svr_allocd_segs)); mutex_enter(&msp->ms_lock); /* * Assert nothing in flight -- ms_*tree is empty. 
*/ for (int i = 0; i < TXG_SIZE; i++) ASSERT0(zfs_range_tree_space(msp->ms_allocating[i])); for (int i = 0; i < TXG_DEFER_SIZE; i++) ASSERT0(zfs_range_tree_space(msp->ms_defer[i])); ASSERT0(zfs_range_tree_space(msp->ms_freed)); if (msp->ms_sm != NULL) VERIFY0(space_map_load(msp->ms_sm, segs, SM_ALLOC)); zfs_range_tree_walk(msp->ms_unflushed_allocs, zfs_range_tree_add, segs); zfs_range_tree_walk(msp->ms_unflushed_frees, zfs_range_tree_remove, segs); zfs_range_tree_walk(msp->ms_freeing, zfs_range_tree_remove, segs); mutex_exit(&msp->ms_lock); /* * Clear everything past what has been synced, * because we have not allocated mappings for it yet. */ uint64_t syncd = vdev_indirect_mapping_max_offset(vim); uint64_t ms_end = msp->ms_start + msp->ms_size; if (ms_end > syncd) zfs_range_tree_clear(segs, syncd, ms_end - syncd); zfs_range_tree_vacate(segs, free_mapped_segment_cb, vd); } zfs_range_tree_destroy(segs); /* * Note: this must happen after we invoke free_mapped_segment_cb, * because it adds to the obsolete_segments. */ zfs_range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); ASSERT3U(vic->vic_mapping_object, ==, vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); vdev_indirect_mapping_close(vd->vdev_indirect_mapping); vd->vdev_indirect_mapping = NULL; vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); vic->vic_mapping_object = 0; ASSERT3U(vic->vic_births_object, ==, vdev_indirect_births_object(vd->vdev_indirect_births)); vdev_indirect_births_close(vd->vdev_indirect_births); vd->vdev_indirect_births = NULL; vdev_indirect_births_free(mos, vic->vic_births_object, tx); vic->vic_births_object = 0; /* * We may have processed some frees from the removing vdev in this * txg, thus increasing svr_bytes_done; discard that here to * satisfy the assertions in spa_vdev_removal_destroy(). * Note that future txg's can not have any bytes_done, because * future TXG's are only modified from open context, and we have * already shut down the copying thread. */ svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; spa_finish_removal(spa, DSS_CANCELED, tx); vd->vdev_removing = B_FALSE; if (!vdev_prop_allocating_off(vd)) { spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); vdev_activate(vd); spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); } vdev_config_dirty(vd); zfs_dbgmsg("canceled device removal for vdev %llu in %llu", (u_longlong_t)vd->vdev_id, (u_longlong_t)dmu_tx_get_txg(tx)); spa_history_log_internal(spa, "vdev remove canceled", tx, "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); } static int spa_vdev_remove_cancel_impl(spa_t *spa) { int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED); return (error); } int spa_vdev_remove_cancel(spa_t *spa) { spa_vdev_remove_suspend(spa); if (spa->spa_vdev_removal == NULL) return (ENOTACTIVE); return (spa_vdev_remove_cancel_impl(spa)); } void svr_sync(spa_t *spa, dmu_tx_t *tx) { spa_vdev_removal_t *svr = spa->spa_vdev_removal; int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; if (svr == NULL) return; /* * This check is necessary so that we do not dirty the * DIRECTORY_OBJECT via spa_sync_removing_state() when there * is nothing to do. Dirtying it every time would prevent us * from syncing-to-convergence. */ if (svr->svr_bytes_done[txgoff] == 0) return; /* * Update progress accounting. 
*/ spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; svr->svr_bytes_done[txgoff] = 0; spa_sync_removing_state(spa, tx); } static void vdev_remove_make_hole_and_free(vdev_t *vd) { uint64_t id = vd->vdev_id; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); vdev_free(vd); vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); vdev_config_dirty(rvd); /* * Reassess the health of our root vdev. */ vdev_reopen(rvd); } /* * Remove a log device. The config lock is held for the specified TXG. */ static int spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) { metaslab_group_t *mg = vd->vdev_mg; spa_t *spa = vd->vdev_spa; int error = 0; ASSERT(vd->vdev_islog); ASSERT(vd == vd->vdev_top); ASSERT0P(vd->vdev_log_mg); ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop allocating from this vdev. */ metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * Cancel any initialize or TRIM which was in progress. */ vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED); vdev_autotrim_stop_wait(vd); /* * Evacuate the device. We don't hold the config lock as * writer since we need to do I/O but we do keep the * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (vd->vdev_stat.vs_alloc != 0) error = spa_reset_logs(spa); *txg = spa_vdev_config_enter(spa); if (error != 0) { metaslab_group_activate(mg); ASSERT0P(vd->vdev_log_mg); return (error); } ASSERT0(vd->vdev_stat.vs_alloc); /* * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); /* * When the log space map feature is enabled we look at * the vdev's top_zap to find the on-disk flush data of * the metaslab we just flushed. Thus, while removing a * log vdev we make sure to call vdev_metaslab_fini() * first, which removes all metaslabs of this vdev from * spa_metaslabs_by_flushed before vdev_remove_empty() * destroys the top_zap of this log vdev. * * This avoids the scenario where we flush a metaslab * from the log vdev being removed that doesn't have a * top_zap and end up failing to lookup its on-disk flush * data. * * We don't call metaslab_group_destroy() right away * though (it will be called in vdev_free() later) as * during metaslab_sync() of metaslabs from other vdevs * we may touch the metaslab group of this vdev through * metaslab_class_histogram_verify() */ vdev_metaslab_fini(vd); spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); *txg = spa_vdev_config_enter(spa); sysevent_t *ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_DEV); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* The top ZAP should have been destroyed by vdev_remove_empty. */ ASSERT0(vd->vdev_top_zap); /* The leaf ZAP should have been destroyed by vdev_dtl_sync. 
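/*
 * Illustrative sketch (editor's addition): before evacuating a log device,
 * spa_vdev_remove_log() above releases the config lock until
 * *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE has synced, so the youngest
 * allocations and frees, and their deferred frees, are all on disk.  The
 * toy below only shows the target-txg arithmetic, assuming the usual values
 * TXG_CONCURRENT_STATES == 3 and TXG_DEFER_SIZE == 2.
 */
#include <stdint.h>
#include <stdio.h>

#define	TOY_TXG_CONCURRENT_STATES	3
#define	TOY_TXG_DEFER_SIZE		2

int
main(void)
{
	uint64_t txg = 1000;	/* txg in which removal was requested */
	uint64_t wait_until =
	    txg + TOY_TXG_CONCURRENT_STATES + TOY_TXG_DEFER_SIZE;

	/* Prints 1005: every open txg plus the defer window has synced. */
	printf("wait until txg %llu has synced\n",
	    (unsigned long long)wait_until);
	return (0);
}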
*/ ASSERT0(vd->vdev_leaf_zap); (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) vdev_state_clean(vd); if (list_link_active(&vd->vdev_config_dirty_node)) vdev_config_clean(vd); ASSERT0(vd->vdev_stat.vs_alloc); /* * Clean up the vdev namespace. */ vdev_remove_make_hole_and_free(vd); if (ev != NULL) spa_event_post(ev); return (0); } static int spa_vdev_remove_top_check(vdev_t *vd) { spa_t *spa = vd->vdev_spa; if (vd != vd->vdev_top) return (SET_ERROR(ENOTSUP)); if (!vdev_is_concrete(vd)) return (SET_ERROR(ENOTSUP)); if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) return (SET_ERROR(ENOTSUP)); /* * This device is already being removed */ if (vd->vdev_removing) return (SET_ERROR(EALREADY)); metaslab_class_t *mc = vd->vdev_mg->mg_class; metaslab_class_t *normal = spa_normal_class(spa); if (mc != normal) { /* * Space allocated from the special (or dedup) class is * included in the DMU's space usage, but it's not included * in spa_dspace (or dsl_pool_adjustedsize()). Therefore * there is always at least as much free space in the normal * class, as is allocated from the special (and dedup) class. * As a backup check, we will return ENOSPC if this is * violated. See also spa_update_dspace(). */ uint64_t available = metaslab_class_get_space(normal) - metaslab_class_get_alloc(normal); ASSERT3U(available, >=, vd->vdev_stat.vs_alloc); if (available < vd->vdev_stat.vs_alloc) return (SET_ERROR(ENOSPC)); } else if (!vd->vdev_noalloc) { /* available space in the pool's normal class */ uint64_t available = dsl_dir_space_available( spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); if (available < vd->vdev_stat.vs_dspace) return (SET_ERROR(ENOSPC)); } /* * There can not be a removal in progress. */ if (spa->spa_removing_phys.sr_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); /* * The device must have all its data. */ if (!vdev_dtl_empty(vd, DTL_MISSING) || !vdev_dtl_empty(vd, DTL_OUTAGE)) return (SET_ERROR(EBUSY)); /* * The device must be healthy. */ if (!vdev_readable(vd)) return (SET_ERROR(EIO)); /* * All vdevs in normal class must have the same ashift. */ if (spa->spa_max_ashift != spa->spa_min_ashift) { return (SET_ERROR(EINVAL)); } /* * A removed special/dedup vdev must have same ashift as normal class. */ ASSERT(!vd->vdev_islog); if (vd->vdev_alloc_bias != VDEV_BIAS_NONE && vd->vdev_ashift != spa->spa_max_ashift) { return (SET_ERROR(EINVAL)); } /* * All vdevs in normal class must have the same ashift * and not be raidz or draid. */ vdev_t *rvd = spa->spa_root_vdev; for (uint64_t id = 0; id < rvd->vdev_children; id++) { vdev_t *cvd = rvd->vdev_child[id]; /* * A removed special/dedup vdev must have the same ashift * across all vdevs in its class. */ if (vd->vdev_alloc_bias != VDEV_BIAS_NONE && cvd->vdev_alloc_bias == vd->vdev_alloc_bias && cvd->vdev_ashift != vd->vdev_ashift) { return (SET_ERROR(EINVAL)); } if (cvd->vdev_ashift != 0 && cvd->vdev_alloc_bias == VDEV_BIAS_NONE) ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); if (!vdev_is_concrete(cvd)) continue; if (vdev_get_nparity(cvd) != 0) return (SET_ERROR(EINVAL)); /* * Need the mirror to be mirror of leaf vdevs only */ if (cvd->vdev_ops == &vdev_mirror_ops) { for (uint64_t cid = 0; cid < cvd->vdev_children; cid++) { if (!cvd->vdev_child[cid]->vdev_ops-> vdev_op_leaf) return (SET_ERROR(EINVAL)); } } } return (0); } /* * Initiate removal of a top-level vdev, reducing the total space in the pool. * The config lock is held for the specified TXG. 
Once initiated, * evacuation of all allocated space (copying it to other vdevs) happens * in the background (see spa_vdev_remove_thread()), and can be canceled * (see spa_vdev_remove_cancel()). If successful, the vdev will * be transformed to an indirect vdev (see spa_vdev_remove_complete()). */ static int spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) { spa_t *spa = vd->vdev_spa; boolean_t set_noalloc = B_FALSE; int error; /* * Check for errors up-front, so that we don't waste time * passivating the metaslab group and clearing the ZIL if there * are errors. */ error = spa_vdev_remove_top_check(vd); /* * Stop allocating from this vdev. Note that we must check * that this is not the only device in the pool before * passivating, otherwise we will not be able to make * progress because we can't allocate from any vdevs. * The above check for sufficient free space serves this * purpose. */ if (error == 0 && !vd->vdev_noalloc) { set_noalloc = B_TRUE; error = vdev_passivate(vd, txg); } if (error != 0) return (error); /* * We stop any initializing and TRIM that is currently in progress * but leave the state as "active". This will allow the process to * resume if the removal is canceled sometime later. */ spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE); vdev_autotrim_stop_wait(vd); *txg = spa_vdev_config_enter(spa); /* * Things might have changed while the config lock was dropped * (e.g. space usage). Check for errors again. */ error = spa_vdev_remove_top_check(vd); if (error != 0) { if (set_noalloc) vdev_activate(vd); spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); return (error); } vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, *txg); vdev_config_dirty(vd); dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_initiate_sync, (void *)(uintptr_t)vd->vdev_id, tx); dmu_tx_commit(tx); return (0); } /* * Remove a device from the pool. * * Removing a device from the vdev namespace requires several steps * and can take a significant amount of time. As a result we use * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0, error_log; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); sysevent_t *ev = NULL; const char *vd_type = NULL; char *vd_path = NULL; ASSERT(spa_writeable(spa)); if (!locked) txg = spa_vdev_enter(spa); ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { error = (spa_has_checkpoint(spa)) ? ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; if (!locked) return (spa_vdev_exit(spa, NULL, txg, error)); return (error); } vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { /* * Only remove the hot spare if it's not currently in use * in this pool. 
*/ if (vd == NULL || unspare) { const char *type; boolean_t draid_spare = B_FALSE; if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0 && strcmp(type, VDEV_TYPE_DRAID_SPARE) == 0) draid_spare = B_TRUE; if (vd == NULL && draid_spare) { error = SET_ERROR(ENOTSUP); } else { if (vd == NULL) vd = spa_lookup_by_guid(spa, guid, B_TRUE); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); vd_type = VDEV_TYPE_SPARE; vd_path = spa_strdup(fnvlist_lookup_string( nv, ZPOOL_CONFIG_PATH)); spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } } else { error = SET_ERROR(EBUSY); } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { vd_type = VDEV_TYPE_L2CACHE; vd_path = spa_strdup(fnvlist_lookup_string( nv, ZPOOL_CONFIG_PATH)); /* * Cache devices can always be removed. */ vd = spa_lookup_by_guid(spa, guid, B_TRUE); /* * Stop trimming the cache device. We need to release the * config lock to allow the syncing of TRIM transactions * without releasing the spa_namespace_lock. The same * strategy is employed in spa_vdev_remove_top(). */ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); mutex_enter(&vd->vdev_trim_lock); vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); mutex_exit(&vd->vdev_trim_lock); txg = spa_vdev_config_enter(spa); ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); vd_type = VDEV_TYPE_LOG; vd_path = spa_strdup((vd->vdev_path != NULL) ? vd->vdev_path : "-"); error = spa_vdev_remove_log(vd, &txg); } else if (vd != NULL) { ASSERT(!locked); error = spa_vdev_remove_top(vd, &txg); } else { /* * There is no vdev of any kind with the specified guid. */ error = SET_ERROR(ENOENT); } error_log = error; if (!locked) error = spa_vdev_exit(spa, NULL, txg, error); /* * Logging must be done outside the spa config lock. Otherwise, * this code path could end up holding the spa config lock while * waiting for a txg_sync so it can write to the internal log. * Doing that would prevent the txg sync from actually happening, * causing a deadlock. 
*/ if (error_log == 0 && vd_type != NULL && vd_path != NULL) { spa_history_log_internal(spa, "vdev remove", NULL, "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path); } if (vd_path != NULL) spa_strfree(vd_path); if (ev != NULL) spa_event_post(ev); return (error); } int spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) { prs->prs_state = spa->spa_removing_phys.sr_state; if (prs->prs_state == DSS_NONE) return (SET_ERROR(ENOENT)); prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; prs->prs_start_time = spa->spa_removing_phys.sr_start_time; prs->prs_end_time = spa->spa_removing_phys.sr_end_time; prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; prs->prs_copied = spa->spa_removing_phys.sr_copied; prs->prs_mapping_memory = 0; uint64_t indirect_vdev_id = spa->spa_removing_phys.sr_prev_indirect_vdev; while (indirect_vdev_id != -1) { vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; vdev_indirect_config_t *vic = &vd->vdev_indirect_config; vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); indirect_vdev_id = vic->vic_prev_indirect_vdev; } return (0); } ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW, "Ignore hard IO errors when removing device"); ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, UINT, ZMOD_RW, "Largest contiguous segment to allocate when removing device"); ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, UINT, ZMOD_RW, "Largest span of free chunks a remap segment can span"); ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, UINT, ZMOD_RW, "Pause device removal after this many bytes are copied " "(debug use only - causes removal to hang)"); EXPORT_SYMBOL(free_from_removing_vdev); EXPORT_SYMBOL(spa_removal_get_stats); EXPORT_SYMBOL(spa_remove_init); EXPORT_SYMBOL(spa_restart_removal); EXPORT_SYMBOL(spa_vdev_removal_destroy); EXPORT_SYMBOL(spa_vdev_remove); EXPORT_SYMBOL(spa_vdev_remove_cancel); EXPORT_SYMBOL(spa_vdev_remove_suspend); EXPORT_SYMBOL(svr_sync); diff --git a/sys/contrib/openzfs/module/zfs/zfeature.c b/sys/contrib/openzfs/module/zfs/zfeature.c index 0816ea134bf3..4cf9e0dbb405 100644 --- a/sys/contrib/openzfs/module/zfs/zfeature.c +++ b/sys/contrib/openzfs/module/zfs/zfeature.c @@ -1,534 +1,539 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, 2015 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include "zfeature_common.h" #include /* * ZFS Feature Flags * ----------------- * * ZFS feature flags are used to provide fine-grained versioning to the ZFS * on-disk format. 
Once enabled on a pool feature flags replace the old * spa_version() number. * * Each new on-disk format change will be given a uniquely identifying string * GUID rather than a version number. This avoids the problem of different * organizations creating new on-disk formats with the same version number. To * keep feature GUIDs unique they should consist of the reverse dns name of the * organization which implemented the feature and a short name for the feature, * separated by a colon (e.g. com.delphix:async_destroy). * * Reference Counts * ---------------- * * Within each pool features can be in one of three states: disabled, enabled, * or active. These states are differentiated by a reference count stored on * disk for each feature: * * 1) If there is no reference count stored on disk the feature is disabled. * 2) If the reference count is 0 a system administrator has enabled the * feature, but the feature has not been used yet, so no on-disk * format changes have been made. * 3) If the reference count is greater than 0 the feature is active. * The format changes required by the feature are currently on disk. * Note that if the feature's format changes are reversed the feature * may choose to set its reference count back to 0. * * Feature flags makes no differentiation between non-zero reference counts * for an active feature (e.g. a reference count of 1 means the same thing as a * reference count of 27834721), but feature implementations may choose to use * the reference count to store meaningful information. For example, a new RAID * implementation might set the reference count to the number of vdevs using * it. If all those disks are removed from the pool the feature goes back to * having a reference count of 0. * * It is the responsibility of the individual features to maintain a non-zero * reference count as long as the feature's format changes are present on disk. * * Dependencies * ------------ * * Each feature may depend on other features. The only effect of this * relationship is that when a feature is enabled all of its dependencies are * automatically enabled as well. Any future work to support disabling of * features would need to ensure that features cannot be disabled if other * enabled features depend on them. * * On-disk Format * -------------- * * When feature flags are enabled spa_version() is set to SPA_VERSION_FEATURES * (5000). In order for this to work the pool is automatically upgraded to * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature flags on disk * format changes will be in use. * * Information about features is stored in 3 ZAP objects in the pool's MOS. * These objects are linked to by the following names in the pool directory * object: * * 1) features_for_read: feature GUID -> reference count * Features needed to open the pool for reading. * 2) features_for_write: feature GUID -> reference count * Features needed to open the pool for writing. * 3) feature_descriptions: feature GUID -> descriptive string * A human readable string. * * All enabled features appear in either features_for_read or * features_for_write, but not both. * * To open a pool in read-only mode only the features listed in * features_for_read need to be supported. * * To open the pool in read-write mode features in both features_for_read and * features_for_write need to be supported. * * Some features may be required to read the ZAP objects containing feature * information. 
To allow software to check for compatibility with these features * before the pool is opened their names must be stored in the label in a * new "features_for_read" entry (note that features that are only required * to write to a pool never need to be stored in the label since the * features_for_write ZAP object can be read before the pool is written to). * To save space in the label features must be explicitly marked as needing to * be written to the label. Also, reference counts are not stored in the label, * instead any feature whose reference count drops to 0 is removed from the * label. * * Adding New Features * ------------------- * * Features must be registered in zpool_feature_init() function in * zfeature_common.c using the zfeature_register() function. This function * has arguments to specify if the feature should be stored in the * features_for_read or features_for_write ZAP object and if it needs to be * written to the label when active. * * Once a feature is registered it will appear as a "feature@" * property which can be set by an administrator. Feature implementors should * use the spa_feature_is_enabled() and spa_feature_is_active() functions to * query the state of a feature and the spa_feature_incr() and * spa_feature_decr() functions to change an enabled feature's reference count. * Reference counts may only be updated in the syncing context. * * Features may not perform enable-time initialization. Instead, any such * initialization should occur when the feature is first used. This design * enforces that on-disk changes be made only when features are used. Code * should only check if a feature is enabled using spa_feature_is_enabled(), * not by relying on any feature specific metadata existing. If a feature is * enabled, but the feature's metadata is not on disk yet then it should be * created as needed. * * As an example, consider the com.delphix:async_destroy feature. This feature * relies on the existence of a bptree in the MOS that store blocks for * asynchronous freeing. This bptree is not created when async_destroy is * enabled. Instead, when a dataset is destroyed spa_feature_is_enabled() is * called to check if async_destroy is enabled. If it is and the bptree object * does not exist yet, the bptree object is created as part of the dataset * destroy and async_destroy's reference count is incremented to indicate it * has made an on-disk format change. Later, after the destroyed dataset's * blocks have all been asynchronously freed there is no longer any use for the * bptree object, so it is destroyed and async_destroy's reference count is * decremented back to 0 to indicate that it has undone its on-disk format * changes. */ typedef enum { FEATURE_ACTION_INCR, FEATURE_ACTION_DECR, } feature_action_t; /* * Checks that the active features in the pool are supported by * this software. Adds each unsupported feature (name -> description) to * the supplied nvlist. */ boolean_t spa_features_check(spa_t *spa, boolean_t for_write, nvlist_t *unsup_feat, nvlist_t *enabled_feat) { objset_t *os = spa->spa_meta_objset; boolean_t supported; zap_cursor_t *zc; zap_attribute_t *za; uint64_t obj = for_write ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; char *buf; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = zap_attribute_alloc(); buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); supported = B_TRUE; for (zap_cursor_init(zc, os, obj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { ASSERT(za->za_integer_length == sizeof (uint64_t) && za->za_num_integers == 1); if (NULL != enabled_feat) { fnvlist_add_uint64(enabled_feat, za->za_name, za->za_first_integer); } if (za->za_first_integer != 0 && !zfeature_is_supported(za->za_name)) { supported = B_FALSE; if (NULL != unsup_feat) { const char *desc = ""; if (zap_lookup(os, spa->spa_feat_desc_obj, za->za_name, 1, MAXPATHLEN, buf) == 0) desc = buf; VERIFY0(nvlist_add_string(unsup_feat, za->za_name, desc)); } } } zap_cursor_fini(zc); kmem_free(buf, MAXPATHLEN); zap_attribute_free(za); kmem_free(zc, sizeof (zap_cursor_t)); return (supported); } /* * Use an in-memory cache of feature refcounts for quick retrieval. * * Note: well-designed features will not need to use this; they should * use spa_feature_is_enabled() and spa_feature_is_active() instead. * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). */ int feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { ASSERT(VALID_FEATURE_FID(feature->fi_feature)); if (spa->spa_feat_refcount_cache[feature->fi_feature] == SPA_FEATURE_DISABLED) { return (SET_ERROR(ENOTSUP)); } *res = spa->spa_feat_refcount_cache[feature->fi_feature]; return (0); } /* * Note: well-designed features will not need to use this; they should * use spa_feature_is_enabled() and spa_feature_is_active() instead. * However, this is non-static for zdb and zhack. */ int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { int err; uint64_t refcount; uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; /* * If the pool is currently being created, the feature objects may not * have been allocated yet. Act as though all features are disabled. */ if (zapobj == 0) return (SET_ERROR(ENOTSUP)); err = zap_lookup(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount); if (err != 0) { if (err == ENOENT) return (SET_ERROR(ENOTSUP)); else return (err); } *res = refcount; return (0); } static int feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) { uint64_t enabled_txg_obj __maybe_unused = spa->spa_feat_enabled_txg_obj; ASSERT(zfeature_depends_on(feature->fi_feature, SPA_FEATURE_ENABLED_TXG)); if (!spa_feature_is_enabled(spa, feature->fi_feature)) { return (SET_ERROR(ENOTSUP)); } ASSERT(enabled_txg_obj != 0); VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, feature->fi_guid, sizeof (uint64_t), 1, res)); return (0); } /* * This function is non-static for zhack; it should otherwise not be used * outside this file. */ void feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, dmu_tx_t *tx) { ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + ASSERT(MUTEX_HELD(&spa->spa_feat_stats_lock)); VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, sizeof (uint64_t), 1, &refcount, tx)); /* * feature_sync is called directly from zhack, allowing the * creation of arbitrary features whose fi_feature field may * be greater than SPA_FEATURES. 
When called from zhack, the * zfeature_info_t object's fi_feature field will be set to * SPA_FEATURE_NONE. */ if (feature->fi_feature != SPA_FEATURE_NONE) { uint64_t *refcount_cache = &spa->spa_feat_refcount_cache[feature->fi_feature]; VERIFY3U(*refcount_cache, ==, atomic_swap_64(refcount_cache, refcount)); } if (refcount == 0) spa_deactivate_mos_feature(spa, feature->fi_guid); else if (feature->fi_flags & ZFEATURE_FLAG_MOS) spa_activate_mos_feature(spa, feature->fi_guid, tx); } /* * This function is non-static for zhack; it should otherwise not be used * outside this file. */ void feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) { uint64_t initial_refcount = (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0; uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; ASSERT(0 != zapobj); ASSERT(zfeature_is_valid_guid(feature->fi_guid)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); /* * If the feature is already enabled, ignore the request. */ if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0) return; for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) spa_feature_enable(spa, feature->fi_depends[i], tx); VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj, feature->fi_guid, 1, strlen(feature->fi_desc) + 1, feature->fi_desc, tx)); + mutex_enter(&spa->spa_feat_stats_lock); feature_sync(spa, feature, initial_refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { uint64_t enabling_txg = dmu_tx_get_txg(tx); if (spa->spa_feat_enabled_txg_obj == 0ULL) { spa->spa_feat_enabled_txg_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FEATURE_ENABLED_TXG, tx); } spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx); VERIFY0(zap_add(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, feature->fi_guid, sizeof (uint64_t), 1, &enabling_txg, tx)); } /* * Errata #4 is mostly a problem with encrypted datasets, but it * is also a problem where the old encryption feature did not * depend on the bookmark_v2 feature. If the pool does not have * any encrypted datasets we can resolve this issue simply by * enabling this dependency. */ if (spa->spa_errata == ZPOOL_ERRATA_ZOL_8308_ENCRYPTION && spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) && feature->fi_feature == SPA_FEATURE_BOOKMARK_V2) spa->spa_errata = 0; /* * Convert the old on-disk error log to the new format when activating * the head_errlog feature. */ if (feature->fi_feature == SPA_FEATURE_HEAD_ERRLOG) spa_upgrade_errlog(spa, tx); } static void feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, dmu_tx_t *tx) { uint64_t refcount = 0; zfeature_info_t *feature = &spa_feature_table[fid]; uint64_t zapobj __maybe_unused = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; ASSERT(VALID_FEATURE_FID(fid)); ASSERT(0 != zapobj); ASSERT(zfeature_is_valid_guid(feature->fi_guid)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + mutex_enter(&spa->spa_feat_stats_lock); VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); switch (action) { case FEATURE_ACTION_INCR: VERIFY3U(refcount, !=, UINT64_MAX); refcount++; break; case FEATURE_ACTION_DECR: VERIFY3U(refcount, !=, 0); refcount--; break; default: ASSERT(0); break; } feature_sync(spa, feature, refcount, tx); + mutex_exit(&spa->spa_feat_stats_lock); } void spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) { /* * We create feature flags ZAP objects in two instances: during pool * creation and during pool upgrade. */ ASSERT((!spa->spa_sync_on && tx->tx_txg == TXG_INITIAL) || dsl_pool_sync_context(spa_get_dsl(spa))); spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FEATURES_FOR_READ, tx); spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FEATURES_FOR_WRITE, tx); spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FEATURE_DESCRIPTIONS, tx); } /* * Enable any required dependencies, then enable the requested feature. */ void spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) { ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); ASSERT(VALID_FEATURE_FID(fid)); feature_enable_sync(spa, &spa_feature_table[fid], tx); } void spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) { feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx); } void spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) { feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx); } boolean_t spa_feature_is_enabled(spa_t *spa, spa_feature_t fid) { int err; uint64_t refcount = 0; ASSERT(VALID_FEATURE_FID(fid)); if (spa_version(spa) < SPA_VERSION_FEATURES) return (B_FALSE); err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); ASSERT(err == 0 || err == ENOTSUP); return (err == 0); } boolean_t spa_feature_is_active(spa_t *spa, spa_feature_t fid) { int err; uint64_t refcount = 0; ASSERT(VALID_FEATURE_FID(fid)); if (spa_version(spa) < SPA_VERSION_FEATURES) return (B_FALSE); err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); ASSERT(err == 0 || err == ENOTSUP); return (err == 0 && refcount > 0); } /* * For the feature specified by fid (which must depend on * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the * OUT txg argument. * * Returns B_TRUE if the feature is enabled, in which case txg will be filled * with the transaction group in which the specified feature was enabled. * Returns B_FALSE otherwise (i.e. if the feature is not enabled). 
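 *
 * A minimal usage sketch (illustrative only; 'fid' must name a feature
 * that depends on SPA_FEATURE_ENABLED_TXG, and error handling is elided):
 *
 *	uint64_t txg;
 *	if (spa_feature_enabled_txg(spa, fid, &txg)) {
 *		... feature is enabled; 'txg' holds the enabling txg ...
 *	} else {
 *		... feature is not enabled on this pool ...
 *	}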
*/ boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) { int err; ASSERT(VALID_FEATURE_FID(fid)); if (spa_version(spa) < SPA_VERSION_FEATURES) return (B_FALSE); err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg); ASSERT(err == 0 || err == ENOTSUP); return (err == 0); } diff --git a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c index 121b966b9864..76c9d4ccd51f 100644 --- a/sys/contrib/openzfs/module/zfs/zfs_ioctl.c +++ b/sys/contrib/openzfs/module/zfs/zfs_ioctl.c @@ -1,8294 +1,8294 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2024 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, 2021, 2023, 2024, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright 2024 Oxide Computer Company */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). 
* We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. 
In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. * * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; static zfsdev_state_t zfsdev_state_listhead; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ uint64_t zfs_max_nvlist_src_size = 0; /* * When logging the output nvlist of an ioctl in the on-disk history, limit * the logged size to this many bytes. This must be less than DMU_MAX_ACCESS. * This applies primarily to zfs_ioc_channel_program(). */ static uint64_t zfs_history_output_max = 1024 * 1024; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. 
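 *
 * For example (a hedged sketch; the ioctl and key names are hypothetical),
 * an ioctl whose optional "value" argument may arrive as more than one
 * nvpair type would declare it with DATA_TYPE_ANY and ZK_OPTIONAL in its
 * key table, mirroring the zfs_keys_* tables elsewhere in this file:
 *
 *	static const zfs_ioc_key_t zfs_keys_example[] = {
 *		{"value", DATA_TYPE_ANY, ZK_OPTIONAL},
 *	};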
*/ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. 
*/ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl, (void) cr; if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. 
*/ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { const char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ (void) innvl; return (zfs_dozonecheck(zc->zc_name, cr)); } static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. 
*/ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strlcpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). 
*/ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; /* * zfs receive -F requires full receive permission, * otherwise receive:append permission is enough */ if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) { if (zc->zc_guid || nvlist_exists(innvl, "force")) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE_APPEND, cr)) != 0) return (error); } if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = (char *)nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. 
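 *
 * As a hedged illustration of the innvl this policy iterates over (the
 * dataset and bookmark names are hypothetical), userland passes one nvpair
 * per bookmark to create, mapping the new bookmark to its source; each
 * nvpair name must contain a '#', and the delegation check below is made
 * against the filesystem portion of that name:
 *
 *	nvlist_t *bmarks = fnvlist_alloc();
 *	fnvlist_add_string(bmarks, "tank/fs#weekly", "tank/fs@monday");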
*/ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = (char *)nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl, (void) cr; /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; const char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; int error; if (secpolicy_sys_config(cr, B_FALSE) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. 
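 * (This is the policy behind the fault-injection ioctls used by the
 * zinject(8) utility.)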
*/ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc, (void) innvl; return (secpolicy_zinject(cr)); } static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) innvl; return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { (void) zc; nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. 
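 * For example (user and dataset names are hypothetical), granting only
 * "zfs allow bob diff tank/fs" is expected to be enough for bob to run
 * "zfs diff tank/fs@yesterday", which takes a temporary snapshot of the
 * live filesystem behind the scenes.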
*/ int error; if (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); if (ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. 
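 *
 * A minimal usage sketch (error handling elided; 'dsname' is whatever
 * dataset name the caller received):
 *
 *	zfsvfs_t *zfsvfs;
 *	int error = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE);
 *	if (error == 0) {
 *		... operate while inode ops are quiesced ...
 *		zfsvfs_rele(zfsvfs, FTAG);
 *	}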
*/ static int zfsvfs_hold(const char *name, const void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { if (writer) ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); else ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, const void *tag) { ZFS_TEARDOWN_EXIT(zfsvfs, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; const char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY0(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP)); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = 
put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; error = spa_all_configs(&zc->zc_cookie, &configs); if (error) return (error); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } /* * inputs: * poolname name of the pool * scan_type scan func (pool_scan_func_t) * scan_command scrub pause/resume flag (pool_scrub_cmd_t) */ static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { {"scan_type", DATA_TYPE_UINT64, 0}, {"scan_command", DATA_TYPE_UINT64, 0}, {"scan_date_start", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"scan_date_end", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; uint64_t scan_type, scan_cmd; uint64_t date_start, date_end; if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0) return (SET_ERROR(EINVAL)); if (scan_cmd >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(innvl, "scan_date_start", &date_start) != 0) date_start = 0; if (nvlist_lookup_uint64(innvl, "scan_date_end", &date_end) != 0) date_end = 0; if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); if (scan_cmd == POOL_SCRUB_PAUSE) { error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); } else if (scan_type == POOL_SCAN_NONE) { error = spa_scan_stop(spa); } else if (scan_cmd == 
POOL_SCRUB_FROM_LAST_TXG) { error = spa_scan_range(spa, scan_type, spa_get_last_scrubbed_txg(spa), 0); } else { uint64_t txg_start, txg_end; txg_start = txg_end = 0; if (date_start != 0 || date_end != 0) { mutex_enter(&spa->spa_txg_log_time_lock); if (date_start != 0) { txg_start = dbrrd_query(&spa->spa_txg_log_time, date_start, DBRRD_FLOOR); } if (date_end != 0) { txg_end = dbrrd_query(&spa->spa_txg_log_time, date_end, DBRRD_CEILING); } mutex_exit(&spa->spa_txg_log_time_lock); } error = spa_scan_range(spa, scan_type, txg_start, txg_end); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } /* * inputs: * zc_nvlist_src nvlist optionally containing ZPOOL_REGUID_GUID * zc_nvlist_src_size size of the nvlist */ static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { uint64_t *guidp = NULL; nvlist_t *props = NULL; spa_t *spa; uint64_t guid; int error; if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); error = nvlist_lookup_uint64(props, ZPOOL_REGUID_GUID, &guid); if (error == 0) guidp = &guid; else if (error == ENOENT) guidp = NULL; else goto out; } error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa, guidp); spa_close(spa, FTAG); } out: if (props != NULL) nvlist_free(props); return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if 
(dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { error = spa_vdev_add(spa, config, zc->zc_flags); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL && zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_REMOVED: error = vdev_remove_wanted(spa, zc->zc_guid); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config; int replacing = zc->zc_cookie; int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, rebuild); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; const char *path = 
zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; const char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (!zc->zc_simple && zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) { nvlist_free(nv); return (error); } VERIFY0(error); } if (error == 0) error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY0(nvlist_add_uint64(props, zfs_prop_to_name(prop), value)); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. 
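 * That is why the code below only builds the nvlist when the fast stats
 * report the dataset as consistent and of type DMU_OST_ZFS; anything else
 * returns ENOENT without touching the ZPL properties.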
*/ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY0(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP)); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_DEFAULTUSERQUOTA)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_DEFAULTGROUPQUOTA)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_DEFAULTPROJECTQUOTA)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_DEFAULTUSEROBJQUOTA)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_DEFAULTGROUPOBJQUOTA)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_DEFAULTPROJECTOBJQUOTA)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_src iteration range nvlist * zc_nvlist_src_size size of iteration range nvlist * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { int error; objset_t *os, *ossnap; dsl_dataset_t *ds; uint64_t min_txg = 0, max_txg = 0; if (zc->zc_nvlist_src_size != 0) { nvlist_t *props = NULL; error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, &min_txg); (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, &max_txg); nvlist_free(props); } error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. 
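 * (Appending the '@' separator with strlcat() would already push the name
 * to ZFS_MAX_DATASET_NAME_LEN or beyond, which is what the check below
 * detects before returning ESRCH.)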
*/ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } while (error == 0) { if (issig()) { error = SET_ERROR(EINTR); break; } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == ENOENT) { error = SET_ERROR(ESRCH); break; } else if (error != 0) { break; } error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, FTAG, &ds); if (error != 0) break; if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { dsl_dataset_rele(ds, FTAG); /* undo snapshot name append */ *(strchr(zc->zc_name, '@') + 1) = '\0'; /* skip snapshot */ continue; } if (zc->zc_simple) { dsl_dataset_fast_stat(ds, &zc->zc_objset_stats); dsl_dataset_rele(ds, FTAG); break; } if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } dsl_dataset_rele(ds, FTAG); break; } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *dash, *domain; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY0(nvpair_value_nvlist(pair, &attrs)); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). 
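 *
 * Illustrative caller-side handling, as done by zfs_set_prop_nvlist()
 * below: a -1 result means "not special", and the pair is queued into
 * 'genericnvl' so it can be applied in a single dsl_props_set() call:
 *
 *	err = zfs_prop_set_special(dsname, source, pair);
 *	if (err == -1)
 *		err = nvlist_add_nvpair(genericnvl, pair);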
*/ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; int err = -1; if (prop == ZPROP_USERPROP) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY0(nvpair_value_nvlist(pair, &attrs)); VERIFY0(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair)); } /* all special properties are numeric except for keylocation */ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { strval = fnvpair_value_string(pair); } else { intval = fnvpair_value_uint64(pair); } switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_KEYLOCATION: err = dsl_crypto_can_set_keylocation(dsname, strval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_COMPRESSION: err = dsl_dataset_set_compression(dsname, source, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VOLTHREADING: err = zvol_set_volthreading(dsname, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_SNAPDEV: case ZFS_PROP_VOLMODE: err = zvol_set_common(dsname, prop, source, intval); break; case ZFS_PROP_READONLY: err = zvol_set_ro(dsname, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } case ZFS_PROP_LONGNAME: { zfsvfs_t *zfsvfs; /* * Ignore the checks if the property is being applied as part of * 'zfs receive'. Because, we already check if the local pool * has SPA_FEATURE_LONGNAME enabled in dmu_recv_begin_check(). 
*/ if (source == ZPROP_SRC_RECEIVED) { cmn_err(CE_NOTE, "Skipping ZFS_PROP_LONGNAME checks " "for dsname=%s\n", dsname); err = -1; break; } if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE)) != 0) { cmn_err(CE_WARN, "%s:%d Failed to hold for dsname=%s " "err=%d\n", __FILE__, __LINE__, dsname, err); break; } if (!spa_feature_is_enabled(zfsvfs->z_os->os_spa, SPA_FEATURE_LONGNAME)) { err = ENOTSUP; } else { /* * Set err to -1 to force the zfs_set_prop_nvlist code * down the default path to set the value in the nvlist. */ err = -1; } zfsvfs_rele(zfsvfs, FTAG); break; } case ZFS_PROP_DEFAULTUSERQUOTA: case ZFS_PROP_DEFAULTGROUPQUOTA: case ZFS_PROP_DEFAULTPROJECTQUOTA: case ZFS_PROP_DEFAULTUSEROBJQUOTA: case ZFS_PROP_DEFAULTGROUPOBJQUOTA: case ZFS_PROP_DEFAULTPROJECTOBJQUOTA: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_default_quota(zfsvfs, prop, intval); zfsvfs_rele(zfsvfs, FTAG); break; } default: err = -1; } return (err); } static boolean_t zfs_is_namespace_prop(zfs_prop_t prop) { switch (prop) { case ZFS_PROP_ATIME: case ZFS_PROP_RELATIME: case ZFS_PROP_DEVICES: case ZFS_PROP_EXEC: case ZFS_PROP_SETUID: case ZFS_PROP_READONLY: case ZFS_PROP_XATTR: case ZFS_PROP_NBMAND: return (B_TRUE); default: return (B_FALSE); } } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; int err; uint64_t intval; const char *strval; boolean_t should_update_mount_cache = B_FALSE; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } 
} /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } if (zfs_is_namespace_prop(prop)) should_update_mount_cache = B_TRUE; } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (nvlist_empty(genericnvl)) goto out; /* * Try to set them all in one batch. */ err = dsl_props_set(dsname, source, genericnvl); if (err == 0) goto out; /* * If batching fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } out: if (should_update_mount_cache) zfs_ioctl_update_mount_cache(dsname); nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY0(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP)); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY0(nvlist_add_nvpair(*newprops, pair)); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? 
ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_USERPROP && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_USERPROP) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY0(nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY0(nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. 
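 * ("configfile" is the cachefile property, ZPOOL_PROP_CACHEFILE, as the
 * check below shows; a faulted pool may not be openable with spa_open(),
 * but its cachefile can still be rewritten via spa_configfile_set() and
 * spa_write_cachefile().)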
*/ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } /* * innvl: { * "get_props_names": [ "prop1", "prop2", ..., "propN" ] * } */ static const zfs_ioc_key_t zfs_keys_get_props[] = { { ZPOOL_GET_PROPS_NAMES, DATA_TYPE_STRING_ARRAY, ZK_OPTIONAL }, }; static int zfs_ioc_pool_get_props(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; char **props = NULL; unsigned int n_props = 0; int error; if (nvlist_lookup_string_array(innvl, ZPOOL_GET_PROPS_NAMES, &props, &n_props) != 0) { props = NULL; } if ((error = spa_open(pool, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) != NULL) { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, outnvl); } mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, outnvl); if (error == 0 && props != NULL) error = spa_prop_get_nvlist(spa, props, n_props, outnvl); spa_close(spa, FTAG); } return (error); } /* * innvl: { * "vdevprops_set_vdev" -> guid * "vdevprops_set_props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_vdev_set_props[] = { {ZPOOL_VDEV_PROPS_SET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_SET_PROPS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_vdev_set_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); ASSERT(spa_writeable(spa)); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_set(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * innvl: { * "vdevprops_get_vdev" -> guid * (optional) "vdevprops_get_props" -> { propname -> propid } * } * * outnvl: propname -> value */ static const zfs_ioc_key_t zfs_keys_vdev_get_props[] = { {ZPOOL_VDEV_PROPS_GET_VDEV, DATA_TYPE_UINT64, 0}, {ZPOOL_VDEV_PROPS_GET_PROPS, DATA_TYPE_NVLIST, ZK_OPTIONAL} }; static int zfs_ioc_vdev_get_props(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; vdev_t *vd; uint64_t vdev_guid; /* Early validation */ if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, &vdev_guid) != 0) return (SET_ERROR(EINVAL)); if (outnvl == NULL) return (SET_ERROR(EINVAL)); if ((error = spa_open(poolname, &spa, FTAG)) != 0) return (error); if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { spa_close(spa, FTAG); return (SET_ERROR(ENOENT)); } error = vdev_prop_get(vd, innvl, outnvl); spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of 
filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if (zfs_deleg_verify_nvlist(fsaclnv) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; uint64_t duq = ZFS_PROP_UNDEFINED, duoq = ZFS_PROP_UNDEFINED; uint64_t dgq = ZFS_PROP_UNDEFINED, dgoq = ZFS_PROP_UNDEFINED; uint64_t dpq = ZFS_PROP_UNDEFINED, dpoq = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. 
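 * Anything the creator did not specify stays at the ZFS_PROP_UNDEFINED
 * sentinel and is filled in further down, from the parent objset or the
 * property default, via zfs_get_zplprop().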
*/ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTUSERQUOTA), &duq); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTUSERQUOTA)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPQUOTA), &dgq); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPQUOTA)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTQUOTA), &dpq); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTQUOTA)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTUSEROBJQUOTA), &duoq); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTUSEROBJQUOTA)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPOBJQUOTA), &dgoq); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPOBJQUOTA)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTOBJQUOTA), &dpoq); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTOBJQUOTA)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver)); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm)); /* * If we're normalizing, names must always be valid UTF-8 strings. 
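 * (A nonzero 'norm' therefore forces utf8only on before the value is
 * copied into zplprops, regardless of what the creator supplied.)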
*/ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8)); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense)); if (duq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSERQUOTA, &duq)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_DEFAULTUSERQUOTA), duq)); if (dgq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPQUOTA, &dgq)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPQUOTA), dgq)); if (dpq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTQUOTA, &dpq)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTQUOTA), dpq)); if (duoq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTUSEROBJQUOTA, &duoq)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_DEFAULTUSEROBJQUOTA), duoq)); if (dgoq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTGROUPOBJQUOTA, &dgoq)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_DEFAULTGROUPOBJQUOTA), dgoq)); if (dpoq == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_DEFAULTPROJECTOBJQUOTA, &dpoq)) != 0) return (error); VERIFY0(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_DEFAULTPROJECTOBJQUOTA), dpoq)); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. 
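 * (The pool root dataset has no parent; that case goes through
 * zfs_fill_zplprops_root() instead, which passes a NULL parent objset as
 * noted in the zfs_fill_zplprops_impl() header comment above.)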
*/ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY0(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP)); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. 
Wait for * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dsl_dataset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ (void) fsname, (void) innvl, (void) outnvl; return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. 
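 * The inner loop below compares everything up to and including the '@'
 * against each later entry and returns EXDEV on a duplicate, since a
 * single request may not name two snapshots of the same filesystem.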
*/ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { (void) unused, (void) outnvl; const char *message; char *poolname; spa_t *spa; int error; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the bootloader(s) to store the bootloader and/or system * specific data. * The data is stored as nvlist data stream, and is protected by * an embedded checksum. * The version can have two possible values: * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. * VB_NVLIST: nvlist with arbitrary pairs. */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"version", DATA_TYPE_UINT64, 0}, {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int error; spa_t *spa; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } static int zfs_unmount_snap_cb(const char *snapname, void *arg) { (void) arg; zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. 
This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. * * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) poolname; return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... 
* } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl; char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { const char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; return (spa_checkpoint_discard(poolname)); } /* * Loads specific types of data for the given pool * * innvl: { * "prefetch_type" -> int32_t * } * * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_prefetch[] = { {ZPOOL_PREFETCH_TYPE, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_pool_prefetch(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int error; spa_t *spa; int32_t type; /* * 
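 * As an illustrative sketch only (not part of the original source), a
 * caller could assemble the innvl this handler expects with the fnvlist
 * helpers used throughout this file:
 *
 *	nvlist_t *innvl = fnvlist_alloc();
 *	fnvlist_add_int32(innvl, ZPOOL_PREFETCH_TYPE, ZPOOL_PREFETCH_DDT);
 *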
 * Currently, only ZPOOL_PREFETCH_DDT is supported
 */
	if (nvlist_lookup_int32(innvl, ZPOOL_PREFETCH_TYPE, &type) != 0 ||
	    type != ZPOOL_PREFETCH_DDT) {
		return (EINVAL);
	}

	error = spa_open(poolname, &spa, FTAG);
	if (error != 0)
		return (error);

	hrtime_t start_time = gethrtime();

	ddt_prefetch_all(spa);

	zfs_dbgmsg("pool '%s': loaded ddt into ARC in %llu ms", spa->spa_name,
	    (u_longlong_t)NSEC2MSEC(gethrtime() - start_time));

	spa_close(spa, FTAG);

	return (error);
}

/*
 * inputs:
 * zc_name		name of dataset to destroy
 * zc_defer_destroy	mark for deferred destroy
 *
 * outputs:		none
 */
static int
zfs_ioc_destroy(zfs_cmd_t *zc)
{
	objset_t *os;
	dmu_objset_type_t ost;
	int err;

	err = dmu_objset_hold(zc->zc_name, FTAG, &os);
	if (err != 0)
		return (err);
	ost = dmu_objset_type(os);
	dmu_objset_rele(os, FTAG);

	if (ost == DMU_OST_ZFS)
		zfs_unmount_snap(zc->zc_name);

	if (strchr(zc->zc_name, '@')) {
		err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
	} else {
		err = dsl_destroy_head(zc->zc_name);
		if (err == EEXIST) {
			/*
			 * It is possible that the given DS may have
			 * hidden child (%recv) datasets - "leftovers"
			 * resulting from the previously interrupted
			 * 'zfs receive'.
			 *
			 * 6 extra bytes for /%recv
			 */
			char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6];

			if (snprintf(namebuf, sizeof (namebuf), "%s/%s",
			    zc->zc_name, recv_clone_name) >=
			    sizeof (namebuf))
				return (SET_ERROR(EINVAL));

			/*
			 * Try to remove the hidden child (%recv) and after
			 * that try to remove the target dataset.
			 * If the hidden child (%recv) does not exist
			 * the original error (EEXIST) will be returned.
			 */
			err = dsl_destroy_head(namebuf);
			if (err == 0)
				err = dsl_destroy_head(zc->zc_name);
			else if (err == ENOENT)
				err = SET_ERROR(EEXIST);
		}
	}

	return (err);
}

/*
 * innvl: {
 *     "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64)
 *     "initialize_vdevs": { -> guids to initialize (nvlist)
 *         "vdev_path_1": vdev_guid_1, (uint64),
 *         "vdev_path_2": vdev_guid_2, (uint64),
 *         ...
 *     },
 * }
 *
 * outnvl: {
 *     "initialize_vdevs": { -> initialization errors (nvlist)
 *         "vdev_path_1": errno, see function body for possible errnos (uint64)
 *         "vdev_path_2": errno, ... (uint64)
 *         ...
 *     }
 * }
 *
 * EINVAL is returned for unknown commands or if any of the provided vdev
 * guids have been specified with a type other than uint64.
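 *
 * As an illustrative sketch only (not part of the original source), a
 * caller could build this innvl with the fnvlist helpers used in this
 * file; the vdev path and guid below are placeholders:
 *
 *	nvlist_t *vdevs = fnvlist_alloc();
 *	fnvlist_add_uint64(vdevs, "/dev/sda1", vdev_guid);
 *
 *	nvlist_t *innvl = fnvlist_alloc();
 *	fnvlist_add_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
 *	    POOL_INITIALIZE_START);
 *	fnvlist_add_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, vdevs);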
 */
static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
	{ZPOOL_INITIALIZE_COMMAND,	DATA_TYPE_UINT64,	0},
	{ZPOOL_INITIALIZE_VDEVS,	DATA_TYPE_NVLIST,	0}
};

static int
zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl,
    nvlist_t *outnvl)
{
	uint64_t cmd_type;
	if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
	    &cmd_type) != 0) {
		return (SET_ERROR(EINVAL));
	}

	if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
	    cmd_type == POOL_INITIALIZE_START ||
	    cmd_type == POOL_INITIALIZE_SUSPEND ||
	    cmd_type == POOL_INITIALIZE_UNINIT)) {
		return (SET_ERROR(EINVAL));
	}

	nvlist_t *vdev_guids;
	if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
	    &vdev_guids) != 0) {
		return (SET_ERROR(EINVAL));
	}

	for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
		uint64_t vdev_guid;
		if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
			return (SET_ERROR(EINVAL));
		}
	}

	spa_t *spa;
	int error = spa_open(poolname, &spa, FTAG);
	if (error != 0)
		return (error);

	nvlist_t *vdev_errlist = fnvlist_alloc();
	int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type,
	    vdev_errlist);

	if (fnvlist_size(vdev_errlist) > 0) {
		fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
		    vdev_errlist);
	}
	fnvlist_free(vdev_errlist);

	spa_close(spa, FTAG);
	return (total_errors > 0 ? SET_ERROR(EINVAL) : 0);
}

/*
 * innvl: {
 *     "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64)
 *     "trim_vdevs": { -> guids to TRIM (nvlist)
 *         "vdev_path_1": vdev_guid_1, (uint64),
 *         "vdev_path_2": vdev_guid_2, (uint64),
 *         ...
 *     },
 *     "trim_rate" -> Target TRIM rate in bytes/sec.
 *     "trim_secure" -> Set to request a secure TRIM.
 * }
 *
 * outnvl: {
 *     "trim_vdevs": { -> TRIM errors (nvlist)
 *         "vdev_path_1": errno, see function body for possible errnos (uint64)
 *         "vdev_path_2": errno, ... (uint64)
 *         ...
 *     }
 * }
 *
 * EINVAL is returned for unknown commands or if any of the provided vdev
 * guids have been specified with a type other than uint64.
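 *
 * As an illustrative sketch only (not part of the original source), and
 * mirroring the initialize example above, a caller could build this innvl
 * as follows ("vdevs" is a path to guid nvlist as above; the rate and
 * secure keys are optional, and the rate value is just an example):
 *
 *	nvlist_t *innvl = fnvlist_alloc();
 *	fnvlist_add_uint64(innvl, ZPOOL_TRIM_COMMAND, POOL_TRIM_START);
 *	fnvlist_add_nvlist(innvl, ZPOOL_TRIM_VDEVS, vdevs);
 *	fnvlist_add_uint64(innvl, ZPOOL_TRIM_RATE, 64 << 20);
 *	fnvlist_add_boolean_value(innvl, ZPOOL_TRIM_SECURE, B_TRUE);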
*/ static const zfs_ioc_key_t zfs_keys_pool_trim[] = { {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) return (SET_ERROR(EINVAL)); if (!(cmd_type == POOL_TRIM_CANCEL || cmd_type == POOL_TRIM_START || cmd_type == POOL_TRIM_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) return (SET_ERROR(EINVAL)); for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } /* Optional, defaults to maximum rate when not provided */ uint64_t rate; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) rate = 0; /* Optional, defaults to standard TRIM when not provided */ boolean_t secure; if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, &secure) != 0) { secure = B_FALSE; } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? SET_ERROR(EINVAL) : 0); } #define DDT_PRUNE_UNIT "ddt_prune_unit" #define DDT_PRUNE_AMOUNT "ddt_prune_amount" /* * innvl: { * "ddt_prune_unit" -> uint32_t * "ddt_prune_amount" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_ddt_prune[] = { {DDT_PRUNE_UNIT, DATA_TYPE_INT32, 0}, {DDT_PRUNE_AMOUNT, DATA_TYPE_UINT64, 0}, }; static int zfs_ioc_ddt_prune(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int32_t unit; uint64_t amount; if (nvlist_lookup_int32(innvl, DDT_PRUNE_UNIT, &unit) != 0 || nvlist_lookup_uint64(innvl, DDT_PRUNE_AMOUNT, &amount) != 0) { return (EINVAL); } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_FAST_DEDUP)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = ddt_prune_unique_entries(spa, (zpool_ddt_prune_unit_t)unit, amount); spa_close(spa, FTAG); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. 
* * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. 
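 *
 * (For reference, an illustrative caller-side innvl for this handler,
 * not part of the original source, carries only the activity id, e.g.
 * fnvlist_add_int32(innvl, ZFS_WAIT_ACTIVITY, ZFS_WAIT_DELETEQ), where
 * ZFS_WAIT_DELETEQ is assumed to be one of the ZFS_WAIT_NUM_ACTIVITIES
 * values.)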
*/ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; const char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(fsname)) != NULL) { + } else if (zvol_suspend(fsname, &zv) == 0) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. 
* innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; nvlist_t *redactnvl = NULL; const char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? 
DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_USERPROP) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY0(nvpair_value_nvlist(pair, &attrs)); VERIFY0(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair)); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. 
*/ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. 
The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY0(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP)); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY0(nvlist_remove_nvpair(props, pair)); VERIFY0(nvlist_add_int32(errors, zc->zc_value, err)); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY0(nvpair_value_int32(pair, &rv)); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY0(nvpair_value_nvlist(p1, &attrs)); VERIFY0(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1)); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY0(nvpair_value_nvlist(p2, &attrs)); VERIFY0(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2)); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { const char *valstr1, *valstr2; VERIFY0(nvpair_value_string(p1, &valstr1)); VERIFY0(nvpair_value_string(p2, &valstr2)); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY0(nvpair_value_uint64(p1, &intval1)); VERIFY0(nvpair_value_uint64(p2, &intval2)); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. 
* * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, /* * Setting ZFS_PROP_SHARESMB requires the objset type to be * known, which is not possible prior to receipt of raw sends. */ ZFS_PROP_SHARESMB, 0 }; int i; VERIFY0(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP)); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY0(nvlist_add_nvpair(delayprops, nvp)); VERIFY0(nvlist_remove_nvpair(props, nvp)); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. */ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, const char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t heal, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *inherited_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; if ((input_fp = zfs_file_get(input_fd)) == NULL) return (SET_ERROR(EBADF)); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, heal, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. 
*/ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_USERPROP) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); inherited_delayprops = extract_delay_props(xprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); - } else if ((zv = zvol_suspend(tofs)) != NULL) { + } else if (zvol_suspend(tofs, &zv) == 0) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. 
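 * (These are the properties split out by extract_delay_props() above,
 * such as refquota, keylocation and sharesmb, which can only be applied
 * meaningfully once the stream has been fully received.)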
*/ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } if (inherited_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inherited_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { ASSERT0(nvlist_merge(recvprops, recv_delayprops, 0)); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT0(nvlist_merge(localprops, local_delayprops, 0)); nvlist_free(local_delayprops); } if (inherited_delayprops != NULL) { ASSERT0(nvlist_merge(localprops, inherited_delayprops, 0)); nvlist_free(inherited_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. 
*/ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: zfs_file_put(input_fp); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%') != NULL) { return (SET_ERROR(EINVAL)); } (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) { goto out; } if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) { goto out; } if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. 
*/ error = SET_ERROR(EINVAL); } out: nvlist_free(errors); nvlist_free(recvdprops); nvlist_free(localprops); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "heal" -> use send stream to heal data corruption * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"heal", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; const char *snapname; const char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t heal; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%') != NULL) { return (SET_ERROR(EINVAL)); } (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); heal = nvlist_exists(innvl, "heal"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) goto out; error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) goto out; error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) goto out; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, heal, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); out: nvlist_free(errors); nvlist_free(recvprops); 
nvlist_free(localprops); nvlist_free(hidden_args); return (error); } /* * When stack space is limited, we write replication stream data to the target * on a separate taskq thread, to make sure there's enough stack space. */ #ifndef HAVE_LARGE_STACKS #define USE_SEND_TASKQ 1 #endif typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } typedef struct dump_bytes_arg { zfs_file_t *dba_fp; #ifdef USE_SEND_TASKQ taskq_t *dba_tq; taskq_ent_t dba_tqent; #endif } dump_bytes_arg_t; static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_arg_t *dba = (dump_bytes_arg_t *)arg; dump_bytes_io_t dbi; dbi.dbi_fp = dba->dba_fp; dbi.dbi_buf = buf; dbi.dbi_len = len; #ifdef USE_SEND_TASKQ taskq_dispatch_ent(dba->dba_tq, dump_bytes_cb, &dbi, TQ_SLEEP, &dba->dba_tqent); taskq_wait(dba->dba_tq); #else dump_bytes_cb(&dbi); #endif return (dbi.dbi_err); } static int dump_bytes_init(dump_bytes_arg_t *dba, int fd, dmu_send_outparams_t *out) { zfs_file_t *fp = zfs_file_get(fd); if (fp == NULL) return (SET_ERROR(EBADF)); dba->dba_fp = fp; #ifdef USE_SEND_TASKQ dba->dba_tq = taskq_create("z_send", 1, defclsyspri, 0, 0, 0); taskq_init_ent(&dba->dba_tqent); #endif memset(out, 0, sizeof (dmu_send_outparams_t)); out->dso_outfunc = dump_bytes; out->dso_arg = dba; out->dso_dryrun = B_FALSE; return (0); } static void dump_bytes_fini(dump_bytes_arg_t *dba) { zfs_file_put(dba->dba_fp); #ifdef USE_SEND_TASKQ taskq_destroy(dba->dba_tq); #endif } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. 
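 *
 * As an illustrative sketch only (not part of the original source), a
 * legacy caller fills the zfs_cmd_t roughly as follows; the objset ids
 * and file descriptor are placeholders, and a nonzero zc_guid would
 * request only a size estimate, returned in zc_objset_type:
 *
 *	zfs_cmd_t zc;
 *
 *	memset(&zc, 0, sizeof (zc));
 *	(void) strlcpy(zc.zc_name, "pool/fs@snap", sizeof (zc.zc_name));
 *	zc.zc_sendobj = tosnap_objsetid;
 *	zc.zc_fromobj = fromsnap_objsetid;
 *	zc.zc_cookie = output_fd;
 *	zc.zc_guid = 0;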
*/ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { dump_bytes_arg_t dba; dmu_send_outparams_t out; error = dump_bytes_init(&dba, zc->zc_cookie, &out); if (error) return (error); off = zfs_file_off(dba.dba_fp); error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); dump_bytes_fini(&dba); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. 
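 * (atomic_cas_64(ptr, 0, 0) works as an atomic load because it only
 * stores 0 when the current value is already 0, so the memory is never
 * changed, and it always returns the previous value.)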
*/ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &zc->zc_nvlist_dst_size); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. Check for remote activity. */ if (spa_multihost(spa) && spa_suspended(spa) && spa_mmp_remote_host_activity(spa)) { spa_close(spa, FTAG); return (SET_ERROR(EREMOTEIO)); } spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). 
* } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. 
"S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size, &zc->zc_guid); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) { mutex_enter(&zfsvfs->z_os->os_upgrade_lock); if (zfsvfs->z_os->os_upgrade_id == 0) { /* clear potential error code and retry */ zfsvfs->z_os->os_upgrade_status = 0; mutex_exit(&zfsvfs->z_os->os_upgrade_lock); dsl_pool_config_enter( dmu_objset_pool(zfsvfs->z_os), FTAG); dmu_objset_userspace_upgrade(zfsvfs->z_os); dsl_pool_config_exit( dmu_objset_pool(zfsvfs->z_os), FTAG); } else { mutex_exit(&zfsvfs->z_os->os_upgrade_lock); } taskq_wait_id(zfsvfs->z_os->os_spa->spa_upgrade_taskq, zfsvfs->z_os->os_upgrade_id); error = zfsvfs->z_os->os_upgrade_status; } zfs_vfs_rele(zfsvfs); } else { objset_t *os; /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_userspace_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if 
(os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; minor_t minor; zfs_file_t *fp = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); int error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); zfs_onexit_fd_rele(fp); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; if ((fp = zfs_file_get(zc->zc_cookie)) == NULL) return (SET_ERROR(EBADF)); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); zfs_file_put(fp); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { (void) pool; nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; zfs_file_t *fp = NULL; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { const char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { fp = zfs_onexit_fd_hold(cleanup_fd, &minor); if (fp == NULL) return (SET_ERROR(EBADF)); } error = dsl_dataset_user_hold(holds, minor, errlist); if (fp != NULL) { ASSERT3U(minor, !=, 0); zfs_onexit_fd_rele(fp); } return (SET_ERROR(error)); } /* * innvl is not used. 
* * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { (void) args; return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { (void) pool; return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); zfs_zevent_fd_rele(fp); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { uint_t count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; zfs_file_t *fp = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (fp == NULL) return (SET_ERROR(EBADF)); error = zfs_zevent_seek(ze, zc->zc_guid); zfs_zevent_fd_rele(fp); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t 
*new, *old; const char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int error; offset_t off; const char *fromname = NULL; int fd; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; const char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); dump_bytes_arg_t dba; dmu_send_outparams_t out; error = dump_bytes_init(&dba, fd, &out); if (error) return (error); off = zfs_file_off(dba.dba_fp); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); 
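	/*
	 * dmu_send() streams its records through the output callbacks
	 * prepared by dump_bytes_init() above, writing them to the
	 * caller's fd starting at its current offset; see dump_bytes_init()
	 * and dump_bytes_fini() for the exact plumbing.
	 */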
dump_bytes_fini(&dba); return (error); } static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { (void) os, (void) buf; uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; const char *fromname = NULL; const char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj = 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. 
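 *
 * Concretely (a hedged illustration; the names are invented): if
 * "pool/fs#book1" is a redaction bookmark created against
 * "pool/fs@snap" itself, a send-space request for "pool/fs@snap" with
 * innvl { "from" -> "pool/fs#book1" } reaches this lookup with the
 * bookmark and tosnap at the same txg.  dsl_bookmark_lookup() returns
 * EXDEV but fills in zbm, and the estimate must still proceed because
 * re-sending can change which blocks are redacted on the target.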
*/ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. * In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { (void) onvl; int err; boolean_t rc, force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) { err = nvlist_lookup_boolean_value(innvl, "force", &rc); if (err == 0) force = rc; } if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (0); } /* * Load a user's wrapping key into the kernel. 
* innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) innvl, (void) outnvl; int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { (void) outnvl; int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT0P(vec->zvec_legacy_func); ASSERT0P(vec->zvec_func); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this 
file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT0P(vec->zvec_legacy_func); ASSERT0P(vec->zvec_func); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); zfs_ioctl_register("create", 
ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, ARRAY_SIZE(zfs_keys_get_bookmark_props)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_bookmarks, ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, zfs_ioc_recv_new, zfs_secpolicy_recv, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key)); zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, 
B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("redact", ZFS_IOC_REDACT, zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("zpool_prefetch", ZFS_IOC_POOL_PREFETCH, zfs_ioc_pool_prefetch, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_prefetch, ARRAY_SIZE(zfs_keys_pool_prefetch)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); zfs_ioctl_register("wait", ZFS_IOC_WAIT, zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); zfs_ioctl_register("zpool_vdev_get_props", ZFS_IOC_VDEV_GET_PROPS, zfs_ioc_vdev_get_props, zfs_secpolicy_read, POOL_NAME, POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_vdev_get_props, ARRAY_SIZE(zfs_keys_vdev_get_props)); zfs_ioctl_register("zpool_vdev_set_props", ZFS_IOC_VDEV_SET_PROPS, zfs_ioc_vdev_set_props, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_NONE, B_TRUE, B_TRUE, zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); zfs_ioctl_register("get_props", ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, POOL_CHECK_NONE, 
B_FALSE, B_FALSE, zfs_keys_get_props, ARRAY_SIZE(zfs_keys_get_props)); zfs_ioctl_register("zpool_ddt_prune", ZFS_IOC_DDT_PRUNE, zfs_ioc_ddt_prune, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_ddt_prune, ARRAY_SIZE(zfs_keys_ddt_prune)); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. 
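 *
 * That is why both are registered below with log_history (the fourth
 * argument to zfs_ioctl_register_pool()) set to B_FALSE, unlike e.g.
 * ZFS_IOC_POOL_IMPORT, which passes B_TRUE.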
*/ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, 
zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_init_os(); } /* * Verify that for non-legacy ioctls the input nvlist * pairs match against the expected input. * * Possible errors are: * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair */ static int zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) { const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; boolean_t required_keys_found = B_FALSE; /* * examine each input pair */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; /* * check pair against the documented names and type */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { /* if not a wild card name, check for an exact match */ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && strcmp(nvl_keys[k].zkey_name, name) != 0) continue; identified = B_TRUE; if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && nvl_keys[k].zkey_type != type) { return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); } if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; required_keys_found = B_TRUE; break; } /* allow an 'optional' key, everything else is invalid */ if (!identified && (strcmp(name, "optional") != 0 || type != DATA_TYPE_NVLIST)) { return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); } } /* verify that all required keys were found */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; } if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); } return (0); } static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME || type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } int zfsdev_getminor(zfs_file_t *fp, minor_t *minorp) { zfsdev_state_t *zs, *fpd; ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); mutex_enter(&zfsdev_state_lock); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; if (fpd == zs) { *minorp = fpd->zs_minor; mutex_exit(&zfsdev_state_lock); return (0); } } mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EBADF)); } void * zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { membar_consumer(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); case ZST_ZEVENT: return (zs->zs_zevent); case ZST_ALL: return (zs); } } } return (NULL); } /* * Find 
a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. */ static minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } int zfsdev_state_init(void *priv) { zfsdev_state_t *zs, *zsprev = NULL; minor_t minor; boolean_t newzs = B_FALSE; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) break; zsprev = zs; } if (!zs) { zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); newzs = B_TRUE; } zfsdev_private_set_state(priv, zs); zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); /* * In order to provide for lock-free concurrent read access * to the minor list in zfsdev_get_state(), new entries * must be completely written before linking them into the * list whereas existing entries are already linked; the last * operation must be updating zs_minor (from -1 to the new * value). */ if (newzs) { zs->zs_minor = minor; membar_producer(); zsprev->zs_next = zs; } else { membar_producer(); zs->zs_minor = minor; } return (0); } void zfsdev_state_destroy(void *priv) { zfsdev_state_t *zs = zfsdev_private_get_state(priv); ASSERT(zs != NULL); ASSERT3S(zs->zs_minor, >, 0); /* * The last reference to this zfsdev file descriptor is being dropped. * We don't have to worry about lookup grabbing this state object, and * zfsdev_state_init() will not try to reuse this object until it is * invalidated by setting zs_minor to -1. Invalidation must be done * last, with a memory barrier to ensure ordering. This lets us avoid * taking the global zfsdev state lock around destruction. */ zfs_onexit_destroy(zs->zs_onexit); zfs_zevent_destroy(zs->zs_zevent); zs->zs_onexit = NULL; zs->zs_zevent = NULL; membar_producer(); zs->zs_minor = -1; } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; hrtime_t start_time = gethrtime(); cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. 
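 *
 * A minimal sketch of the userspace retry convention for the
 * destination buffer (simplified and hypothetical; the real logic is
 * libzfs' zcmd_expand_dst_nvlist()): issue the ioctl, and on ENOMEM
 * grow the buffer to the size the kernel wrote back and retry.
 *
 *	zc.zc_nvlist_dst_size = 128 * 1024;
 *	zc.zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(zc.zc_nvlist_dst_size);
 *	while (ioctl(fd, request, &zc) != 0 && errno == ENOMEM) {
 *		zc.zc_nvlist_dst = (uint64_t)(uintptr_t)realloc(
 *		    (void *)(uintptr_t)zc.zc_nvlist_dst,
 *		    zc.zc_nvlist_dst_size);
 *	}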
*/ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT0P(vec->zvec_legacy_func); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. 
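 *
 * The canonical case is ZFS_IOC_CHANNEL_PROGRAM: a channel program can
 * commit changes and then fail partway through, so (as below) its
 * history entry is logged for any error other than EINVAL.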
*/ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { size_t out_size = fnvlist_size(outnvl); if (out_size > zfs_history_output_max) { fnvlist_add_int64(lognv, ZPOOL_HIST_OUTPUT_SIZE, out_size); } else { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } fnvlist_add_int64(lognv, ZPOOL_HIST_ELAPSED_NS, gethrtime() - start_time); (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_listhead.zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = &zfsdev_state_listhead; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); if (zs != &zfsdev_state_listhead) kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, U64, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); ZFS_MODULE_PARAM(zfs, zfs_, history_output_max, U64, ZMOD_RW, "Maximum size in bytes of ZFS ioctl output that will be logged"); diff --git a/sys/contrib/openzfs/module/zfs/zvol.c b/sys/contrib/openzfs/module/zfs/zvol.c index 2fd3e1c37045..faced0db7e9e 100644 --- a/sys/contrib/openzfs/module/zfs/zvol.c +++ b/sys/contrib/openzfs/module/zfs/zvol.c @@ -1,2202 +1,2217 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev// * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. * Copyright (c) 2024, 2025, Klara, Inc. */ /* * Note on locking of zvol state structures. * * zvol_state_t represents the connection between a single dataset * (DMU_OST_ZVOL) and the device "minor" (some OS-specific representation of a * "disk" or "device" or "volume", eg, a /dev/zdXX node, a GEOM object, etc). * * The global zvol_state_lock is used to protect access to zvol_state_list and * zvol_htable, which are the primary way to obtain a zvol_state_t from a name. * It should not be used for anything not name-relateds, and you should avoid * sleeping or waiting while its held. See zvol_find_by_name(), zvol_insert(), * zvol_remove(). * * The zv_state_lock is used to protect the contents of the associated * zvol_state_t. Most of the zvol_state_t is dedicated to control and * configuration; almost none of it is needed for data operations (that is, * read, write, flush) so this lock is rarely taken during general IO. It * should be released quickly; you should avoid sleeping or waiting while its * held. * * zv_suspend_lock is used to suspend IO/data operations to a zvol. The read * half should held for the duration of an IO operation. The write half should * be taken when something to wait for IO to complete and the block further IO, * eg for the duration of receive and rollback operations. This lock can be * held for long periods of time. * * Thus, the following lock ordering appies. * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t * * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is * allocated and placed on zvol_state_list, and then other minor operations for * this zvol are going to proceed in the order of issue. 
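 *
 * As a rough sketch, the lookup pattern implied by this ordering is the
 * one used by zvol_find_by_name_hash() below: pin the zvol_state_t
 * under zvol_state_lock, take the per-zvol locks (backing off and
 * retaking them in order if the suspend lock cannot be acquired without
 * blocking), then drop zvol_state_lock:
 *
 *	rw_enter(&zvol_state_lock, RW_READER);
 *	zv = ...;			(candidate found via zvol_htable)
 *	mutex_enter(&zv->zv_state_lock);
 *	if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) {
 *		mutex_exit(&zv->zv_state_lock);
 *		rw_enter(&zv->zv_suspend_lock, mode);
 *		mutex_enter(&zv->zv_state_lock);
 *	}
 *	rw_exit(&zvol_state_lock);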
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; unsigned int zvol_threads = 0; unsigned int zvol_num_taskqs = 0; unsigned int zvol_request_sync = 0; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; extern int zfs_bclone_wait_dirty; zv_taskq_t zvol_taskqs; typedef enum { ZVOL_ASYNC_CREATE_MINORS, ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_SET_VOLMODE, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t zt_op; char zt_name1[MAXNAMELEN]; char zt_name2[MAXNAMELEN]; uint64_t zt_value; uint32_t zt_total; uint32_t zt_done; int32_t zt_status; int zt_error; } zvol_task_t; zv_request_task_t * zv_request_task_create(zv_request_t zvr) { zv_request_task_t *task; task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); taskq_init_ent(&task->ent); task->zvr = zvr; return (task); } void zv_request_task_free(zv_request_task_t *task) { kmem_free(task, sizeof (*task)); } uint64_t zvol_name_hash(const char *name) { uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; return (crc); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p = NULL; rw_enter(&zvol_state_lock, RW_READER); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) { /* * this is the right zvol, take the locks in the * right order */ if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, mode); mutex_enter(&zv->zv_state_lock); /* * zvol cannot be renamed as we continue * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0); } rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name(const char *name, int mode) { return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. 
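 *
 * A hedged illustration of the properties this callback consumes (the
 * values here are made up): the zct_props nvlist must carry
 * ZFS_PROP_VOLSIZE and may carry ZFS_PROP_VOLBLOCKSIZE, e.g.
 *
 *	fnvlist_add_uint64(nvprops,
 *	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), 1ULL << 30);
 *	fnvlist_add_uint64(nvprops,
 *	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 16384);
 *
 * Both keys are stripped again below so the generic property-setting
 * step does not reapply them; the volume size lands in the "size" entry
 * of ZVOL_ZAP_OBJ, which zvol_get_stats() later reads back.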
*/ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY0(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. */ VERIFY0(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE))); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT0(error); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT0(error); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT0(error); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (error); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (error); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. */ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); return (error); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } /* * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume * size will result in a udev "change" event being generated. 
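 *
 * The new size must satisfy zvol_check_volsize(): non-zero and a whole
 * multiple of the volume's block size.  As a worked example (values
 * made up), with a 16K volblocksize a request for 1000000 bytes fails
 * with EINVAL; rounding up to the next multiple,
 * (1000000 + 16383) & ~16383 = 1015808, gives an acceptable size.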
*/ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (error); if (readonly) return (SET_ERROR(EROFS)); zvol_state_t *zv = zvol_find_by_name(name, RW_READER); ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { if (zv != NULL) rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); return (error); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; } out: kmem_free(doi, sizeof (dmu_object_info_t)); if (owned) { dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } if (zv != NULL) mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) zvol_os_update_volsize(zv, volsize); return (error); } /* * Update volthreading. */ int zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) return (SET_ERROR(ENOENT)); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); } /* * Update zvol ro property. */ int zvol_set_ro(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) return (-1); if (value) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } mutex_exit(&zv->zv_state_lock); return (0); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) { spa_close(spa, FTAG); return (SET_ERROR(EDOM)); } spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. 
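/*
 * Illustrative aside (not part of the patch): the basic shape checks behind
 * zvol_check_volsize() and zvol_check_volblocksize() above, restated as a
 * standalone sketch. The block size limits are plain parameters here rather
 * than the SPA_MINBLOCKSIZE/SPA_MAXBLOCKSIZE constants, and the large-blocks
 * feature check is omitted.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
example_ispow2(uint64_t x)
{
	return (x != 0 && (x & (x - 1)) == 0);
}

/* Volume size must be a non-zero multiple of the block size. */
static bool
example_volsize_ok(uint64_t volsize, uint64_t blocksize)
{
	return (volsize != 0 && (volsize % blocksize) == 0);
}

/* Block size must be a power of two within the given limits. */
static bool
example_volblocksize_ok(uint64_t volblocksize, uint64_t minbs, uint64_t maxbs)
{
	return (volblocksize >= minbs && volblocksize <= maxbs &&
	    example_ispow2(volblocksize));
}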
*/ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); int error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } return (error); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } return (error); } /* * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_clone_range_t *lr = arg2; objset_t *os = zv->zv_objset; dmu_tx_t *tx; int error; uint64_t blksz; uint64_t off; uint64_t len; ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, lr_bps[lr->lr_nbps])); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT(spa_feature_is_enabled(dmu_objset_spa(os), SPA_FEATURE_BLOCK_CLONING)); off = lr->lr_offset; len = lr->lr_length; blksz = lr->lr_blksz; if ((off % blksz) != 0) { return (SET_ERROR(EINVAL)); } error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error != 0 || !zv->zv_dn) return (error); tx = dmu_tx_create(os); dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len, blksz); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); goto out; } error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len, tx, lr->lr_bps, lr->lr_nbps); if (error != 0) { dmu_tx_commit(tx); goto out; } /* * zil_replaying() not only check if we are replaying ZIL, but also * updates the ZIL header to record replay progress. 
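/*
 * Illustrative aside (not part of the patch): the offset/length rounding
 * used when zvol_replay_write() above replays a dmu_sync()'d (indirect)
 * record and must rewrite the whole block. The numbers below are made up.
 */
#include <stdint.h>
#include <stdio.h>

static void
example_round_to_block(uint64_t *offset, uint64_t *length, uint64_t blocksize)
{
	if (*length < blocksize) {
		*offset -= *offset % blocksize;	/* align down to block start */
		*length = blocksize;		/* replay the full block */
	}
}

int
main(void)
{
	uint64_t off = 70656, len = 512;	/* hypothetical 512-byte write */

	example_round_to_block(&off, &len, 65536);
	/* Prints "65536 65536": the whole 64K block containing the write. */
	printf("%ju %ju\n", (uintmax_t)off, (uintmax_t)len);
	return (0);
}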
*/ VERIFY(zil_replaying(zv->zv_zilog, tx)); dmu_tx_commit(tx); out: dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; return (error); } int zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, uint64_t outoff, uint64_t len) { zilog_t *zilog_dst; zfs_locked_range_t *inlr, *outlr; objset_t *inos, *outos; dmu_tx_t *tx; blkptr_t *bps; size_t maxblocks; int error = 0; rw_enter(&zv_dst->zv_suspend_lock, RW_READER); if (zv_dst->zv_zilog == NULL) { rw_exit(&zv_dst->zv_suspend_lock); rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); if (zv_dst->zv_zilog == NULL) { zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); zv_dst->zv_flags |= ZVOL_WRITTEN_TO; VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv_dst->zv_suspend_lock); } if (zv_src != zv_dst) rw_enter(&zv_src->zv_suspend_lock, RW_READER); inos = zv_src->zv_objset; outos = zv_dst->zv_objset; /* * Sanity checks */ if (!spa_feature_is_enabled(dmu_objset_spa(outos), SPA_FEATURE_BLOCK_CLONING)) { error = SET_ERROR(EOPNOTSUPP); goto out; } if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { error = SET_ERROR(EXDEV); goto out; } if (inos->os_encrypted != outos->os_encrypted) { error = SET_ERROR(EXDEV); goto out; } if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) { error = SET_ERROR(EINVAL); goto out; } if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) { goto out; } /* * Do not read beyond boundary */ if (len > zv_src->zv_volsize - inoff) len = zv_src->zv_volsize - inoff; if (len > zv_dst->zv_volsize - outoff) len = zv_dst->zv_volsize - outoff; if (len == 0) goto out; /* * No overlapping if we are cloning within the same file */ if (zv_src == zv_dst) { if (inoff < outoff + len && outoff < inoff + len) { error = SET_ERROR(EINVAL); goto out; } } /* * Offsets and length must be at block boundaries */ if ((inoff % zv_src->zv_volblocksize) != 0 || (outoff % zv_dst->zv_volblocksize) != 0) { error = SET_ERROR(EINVAL); goto out; } /* * Length must be multiple of block size */ if ((len % zv_src->zv_volblocksize) != 0) { error = SET_ERROR(EINVAL); goto out; } zilog_dst = zv_dst->zv_zilog; maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) / sizeof (bps[0]); bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); /* * Maintain predictable lock order. */ if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) { inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, RL_READER); outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, RL_WRITER); } else { outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, RL_WRITER); inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, RL_READER); } while (len > 0) { uint64_t size, last_synced_txg; size_t nbps = maxblocks; size = MIN(zv_src->zv_volblocksize * maxblocks, len); last_synced_txg = spa_last_synced_txg( dmu_objset_spa(zv_src->zv_objset)); error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created * in the current transaction group, the error will be * EAGAIN here. Based on zfs_bclone_wait_dirty either * return a shortened range to the caller so it can * fallback, or wait for the next TXG and check again. 
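/*
 * Illustrative aside (not part of the patch): the "predictable lock order"
 * idiom zvol_clone_range() above relies on, shown with plain pthread
 * mutexes. Taking the two locks in one globally agreed order (by address
 * here; by zvol and then offset in the code above) prevents an ABBA
 * deadlock when two threads clone between the same pair of volumes in
 * opposite directions.
 */
#include <pthread.h>
#include <stdint.h>

static void
example_lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
	} else if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);	/* lower address first ... */
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);	/* ... regardless of caller order */
		pthread_mutex_lock(a);
	}
}

static void
example_unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int
main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	example_lock_pair(&m1, &m2);
	example_unlock_pair(&m1, &m2);
	return (0);
}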
*/ if (error == EAGAIN && zfs_bclone_wait_dirty) { txg_wait_synced(dmu_objset_pool (zv_src->zv_objset), last_synced_txg + 1); continue; } break; } tx = dmu_tx_create(zv_dst->zv_objset); dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size, zv_src->zv_volblocksize); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); break; } error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size, tx, bps, nbps); if (error != 0) { dmu_tx_commit(tx); break; } zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff, size, zv_src->zv_volblocksize, bps, nbps); dmu_tx_commit(tx); inoff += size; outoff += size; len -= size; } vmem_free(bps, sizeof (bps[0]) * maxblocks); zfs_rangelock_exit(outlr); zfs_rangelock_exit(inlr); if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { error = zil_commit(zilog_dst, ZVOL_OBJ); } out: if (zv_src != zv_dst) rw_exit(&zv_src->zv_suspend_lock); rw_exit(&zv_dst->zv_suspend_lock); return (error); } /* * Handles TX_CLONE_RANGE transactions. */ void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) { itx_t *itx; lr_clone_range_t *lr; uint64_t partlen, max_log_data; size_t partnbps; if (zil_replaying(zilog, tx)) return; max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); while (nbps > 0) { partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); partlen = partnbps * blksz; ASSERT3U(partlen, <, len + blksz); partlen = MIN(partlen, len); itx = zil_itx_create(txtype, sizeof (*lr) + sizeof (bps[0]) * partnbps); lr = (lr_clone_range_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = partlen; lr->lr_blksz = blksz; lr->lr_nbps = partnbps; memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); zil_itx_assign(zilog, itx, tx); bps += partnbps; ASSERT3U(nbps, >=, partnbps); nbps -= partnbps; off += partlen; ASSERT3U(len, >=, partlen); len -= partlen; } } static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL_V0 */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ zvol_replay_clone_range, /* TX_CLONE_RANGE */ }; /* * zvol_log_write() handles TX_WRITE transactions. 
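/*
 * Illustrative aside (not part of the patch): the chunking arithmetic from
 * zvol_log_clone_range() above, which splits one clone across several
 * TX_CLONE_RANGE records when the block-pointer array would not fit in a
 * single log record. The per-record budget below is a made-up number; the
 * real one comes from zil_max_log_data().
 */
#include <stdint.h>
#include <stdio.h>

static unsigned
example_clone_log_records(uint64_t nbps, uint64_t max_bps_per_record)
{
	unsigned records = 0;

	while (nbps > 0) {
		uint64_t part = nbps < max_bps_per_record ?
		    nbps : max_bps_per_record;	/* MIN(nbps, budget) */
		nbps -= part;
		records++;
	}
	return (records);
}

int
main(void)
{
	/* e.g. cloning 1000 blocks with room for 254 BPs per record -> 4 */
	printf("%u\n", example_clone_log_records(1000, 254));
	return (0);
}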
*/ void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; uint64_t log_size = 0; if (zil_replaying(zilog, tx)) return; write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit); while (size) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = size; if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1, DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) { zil_itx_destroy(itx, 0); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } log_size += itx->itx_size; if (wr_state == WR_NEED_COPY) log_size += len; itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } dsl_pool_wrlog_count(zilog->zl_dmu_pool, log_size, tx->tx_txg); } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; zil_itx_assign(zilog, itx, tx); } static void zvol_get_done(zgd_t *zgd, int error) { (void) error; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING); } else { /* indirect write */ ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. 
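/*
 * Illustrative aside (not part of the patch): how zvol_log_write() above
 * chops an indirect (WR_INDIRECT) write into per-block itxs. Each itx
 * covers at most the remainder of the block containing the current offset,
 * so a write spanning blocks never produces an itx that crosses a block
 * boundary. The numbers below are made up.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t blocksize = 8192;
	uint64_t offset = 4096, size = 20480;	/* hypothetical 20K write */

	while (size) {
		/* MIN(blocksize - P2PHASE(offset, blocksize), size) */
		uint64_t in_block = blocksize - (offset % blocksize);
		uint64_t len = size < in_block ? size : in_block;

		printf("itx: offset=%ju len=%ju\n",
		    (uintmax_t)offset, (uintmax_t)len);
		offset += len;
		size -= len;
	}
	return (0);
}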
*/ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (error); } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. */ static void zvol_remove(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Setup zv after we just own the zv->objset */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); zv->zv_zilog = NULL; zv->zv_flags &= ~ZVOL_WRITTEN_TO; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (error); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (error); error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (error); zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shutdown every zv_objset related stuff except zv_objset itself. * The is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_WRITTEN_TO) { ASSERT(zv->zv_zilog != NULL); zil_close(zv->zv_zilog); } zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); dmu_objset_evict_dbufs(zv->zv_objset); } /* * return the proper tag for rollback and recv */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback. */ -zvol_state_t * -zvol_suspend(const char *name) +int +zvol_suspend(const char *name, zvol_state_t **zvp) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) - return (NULL); + return (SET_ERROR(ENOENT)); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); + /* + * If it's being removed, unlock and return error. It doesn't make any + * sense to try to suspend a zvol being removed, but being here also + * means that zvol_remove_minors_impl() is about to call zvol_remove() + * and then destroy the zvol_state_t, so returning a pointer to it for + * the caller to mess with would be a disaster anyway. 
+ */ + if (zv->zv_flags & ZVOL_REMOVING) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + /* NB: Returning EIO here to match zfsvfs_teardown() */ + return (SET_ERROR(EIO)); + } + atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); /* * do not hold zv_state_lock across suspend/resume to * avoid locking up zvol lookups */ mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ - return (zv); + *zvp = zv; + return (0); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); mutex_enter(&zv->zv_state_lock); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because rwlock is * not inherent atomic. */ atomic_dec(&zv->zv_suspend_ref); if (zv->zv_flags & ZVOL_REMOVING) cv_broadcast(&zv->zv_removing_cv); return (error); } int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(mutex_owned(&spa_namespace_lock)); boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) return (error); zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } return (error); } void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_flags & ZVOL_REMOVING) cv_broadcast(&zv->zv_removing_cv); zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); zv->zv_objset = NULL; } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * If spa_keystore_load_wkey() is called for an encrypted zvol, * we 
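/*
 * Illustrative aside (not part of the patch): a sketch of how a caller is
 * expected to use the new zvol_suspend()/zvol_resume() contract above. The
 * helper name and the error policy here are hypothetical; the real callers
 * live in the rollback/receive paths and may treat ENOENT and EIO
 * differently.
 */
static int
example_with_zvol_suspended(const char *name)
{
	zvol_state_t *zv = NULL;
	int error;

	error = zvol_suspend(name, &zv);
	if (error == ENOENT) {
		/* No minor for this dataset; nothing to suspend. */
		error = 0;
	} else if (error == 0) {
		/* ... do the work that required the zvol to be quiesced ... */
		error = zvol_resume(zv);
	}
	/* EIO: the zvol is being removed out from under us; just fail. */
	return (error);
}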
need to look for any clones also using the key. This function * is "best effort" - so we just skip over it if there are failures. */ static void zvol_add_clones(const char *dsname, list_t *minors_list) { /* Also check if it has clones */ dsl_dir_t *dd = NULL; dsl_pool_t *dp = NULL; if (dsl_pool_hold(dsname, FTAG, &dp) != 0) return; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) goto out; if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0) goto out; if (dsl_dir_phys(dd)->dd_clones == 0) goto out; zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); zap_attribute_t *za = zap_attribute_alloc(); objset_t *mos = dd->dd_pool->dp_meta_objset; for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; minors_job_t *job; if (dsl_dataset_hold_obj(dd->dd_pool, za->za_first_integer, FTAG, &clone) == 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(clone, name); char *n = kmem_strdup(name); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); dsl_dataset_rele(clone, FTAG); } } zap_cursor_fini(zc); zap_attribute_free(za); kmem_free(zc, sizeof (zap_cursor_t)); out: if (dd != NULL) dsl_dir_rele(dd, FTAG); dsl_pool_rele(dp, FTAG); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); zvol_add_clones(dsname, minors_list); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ (void) dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } static void zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done, int error) { task->zt_total += total; task->zt_done += done; if (task->zt_total != task->zt_done) { task->zt_status = -1; if (error) task->zt_error = error; } } static void zvol_task_report_status(zvol_task_t *task) { #ifdef ZFS_DEBUG static const char *const msg[] = { "create", "remove", "rename", "set snapdev", "set volmode", "unknown", }; if (task->zt_status == 0) return; zvol_async_op_t op = MIN(task->zt_op, ZVOL_ASYNC_MAX); if (task->zt_error) { dprintf("The %s minors zvol task was not ok, last error %d\n", msg[op], task->zt_error); } else { dprintf("The %s minors zvol task was not ok\n", msg[op]); } #else (void) task; #endif } /* * Create minors for the specified dataset, including children and snapshots. 
* Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach allows one to assure that the * snapshot metadata is read from disk only if it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ static void zvol_create_minors_impl(zvol_task_t *task) { const char *name = task->zt_name1; list_t minors_list; minors_job_t *job; uint64_t snapdev; int total = 0, done = 0, last_error, error; /* * Note: the dsl_pool_config_lock must not be held. * Minor node creation needs to obtain the zvol_state_lock. * zvol_open() obtains the zvol_state_lock and then the dsl pool * config lock. Therefore, we can't have the config lock now if * we are going to wait for the zvol_state_lock, because it * would be a lock order inversion which could lead to deadlock. */ if (zvol_inhibit_dev) { return; } /* * This is the list for prefetch jobs. Whenever we found a match * during dmu_objset_find, we insert a minors_job to the list and do * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * * We will use this list to do zvol_os_create_minor after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); if (strchr(name, '@') != NULL) { error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) { error = zvol_os_create_minor(name); if (error == 0) { done++; } else { last_error = error; } total++; } } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) { error = zvol_os_create_minor(job->name); if (error == 0) { done++; } else { last_error = error; } } else if (job->error == EINVAL) { /* * The objset, with the name requested by current job * exist, but have the type different from zvol. * Just ignore this sort of errors. */ done++; } else { last_error = job->error; } total++; kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); zvol_task_update_status(task, total, done, last_error); } /* * Remove minors for specified dataset and, optionally, its children and * snapshots. */ static void zvol_remove_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; const char *name = task ? task->zt_name1 : NULL; int namelen = ((name) ? strlen(name) : 0); boolean_t children = task ? !!task->zt_value : B_TRUE; if (zvol_inhibit_dev) return; /* * We collect up zvols that we want to remove on a separate list, so * that we don't have to hold zvol_state_lock for the whole time. 
* * We can't remove them from the global lists until we're completely * done with them, because that would make them appear to ZFS-side ops * that they don't exist, and the name might be reused, which can't be * good. */ list_t remove_list; list_create(&remove_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_remove_node)); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_REMOVING) { /* Another thread is handling shutdown, skip it. */ mutex_exit(&zv->zv_state_lock); continue; } /* * This zvol should be removed if: * - no name was offered (ie removing all at shutdown); or * - name matches exactly; or * - we were asked to remove children, and * - the start of the name matches, and * - there is a '/' immediately after the matched name; or * - there is a '@' immediately after the matched name */ if (name == NULL || strcmp(zv->zv_name, name) == 0 || (children && strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* * Matched, so mark it removal. We want to take the * write half of the suspend lock to make sure that * the zvol is not suspended, and give any data ops * chance to finish. */ mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_REMOVING) { /* Another thread has taken it, let them. */ mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); continue; } /* * Mark it and unlock. New entries will see the flag * and return ENXIO. */ zv->zv_flags |= ZVOL_REMOVING; mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* Put it on the list for the next stage. */ list_insert_head(&remove_list, zv); } else mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); /* Didn't match any, nothing to do! */ if (list_is_empty(&remove_list)) { if (task) task->zt_error = SET_ERROR(ENOENT); return; } /* Actually shut them all down. */ for (zv = list_head(&remove_list); zv != NULL; zv = zv_next) { zv_next = list_next(&remove_list, zv); mutex_enter(&zv->zv_state_lock); /* * Still open or suspended, just wait. This can happen if, for * example, we managed to acquire zv_state_lock in the moments * where zvol_open() or zvol_release() are trading locks to * call zvol_first_open() or zvol_last_close(). */ while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock); /* * No users, shut down the OS side. This may not remove the * minor from view immediately, depending on the kernel * specifics, but it will ensure that it is unusable and that * this zvol_state_t can never again be reached from an OS-side * operation. */ zvol_os_remove_minor(zv); mutex_exit(&zv->zv_state_lock); /* Remove it from the name lookup lists */ rw_enter(&zvol_state_lock, RW_WRITER); zvol_remove(zv); rw_exit(&zvol_state_lock); } /* * Our own references on remove_list is the last one, free them and * we're done. 
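/*
 * Illustrative aside (not part of the patch): the matching rule described
 * above, pulled out as a standalone predicate with a few hypothetical
 * dataset names. "tank/vol" matches itself, its snapshots and (when
 * children are requested) its descendants, but not "tank/volume".
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool
example_should_remove(const char *zv_name, const char *name, bool children)
{
	size_t namelen;

	if (name == NULL)			/* removing all at shutdown */
		return (true);
	if (strcmp(zv_name, name) == 0)		/* exact match */
		return (true);
	namelen = strlen(name);
	return (children && strncmp(zv_name, name, namelen) == 0 &&
	    (zv_name[namelen] == '/' || zv_name[namelen] == '@'));
}

int
main(void)
{
	printf("%d\n", example_should_remove("tank/vol", "tank/vol", true));		/* 1 */
	printf("%d\n", example_should_remove("tank/vol@snap", "tank/vol", true));	/* 1 */
	printf("%d\n", example_should_remove("tank/vol/child", "tank/vol", false));	/* 0 */
	printf("%d\n", example_should_remove("tank/volume", "tank/vol", true));	/* 0 */
	return (0);
}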
*/ while ((zv = list_remove_head(&remove_list)) != NULL) zvol_os_free(zv); list_destroy(&remove_list); } /* Remove minor for this specific volume only */ static int zvol_remove_minor_impl(const char *name) { if (zvol_inhibit_dev) return (0); zvol_task_t task; memset(&task, 0, sizeof (zvol_task_t)); strlcpy(task.zt_name1, name, sizeof (task.zt_name1)); task.zt_value = B_FALSE; zvol_remove_minors_impl(&task); return (task.zt_error); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; const char *oldname = task->zt_name1; const char *newname = task->zt_name2; int total = 0, done = 0, last_error, error, oldnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { error = zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); error = zvol_os_rename_minor(zv, name); kmem_strfree(name); } if (error) { last_error = error; } else { done++; } total++; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); zvol_task_update_status(task, total, done, last_error); } typedef struct zvol_snapdev_cb_arg { zvol_task_t *task; uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; int error = 0; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: error = zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: error = zvol_remove_minor_impl(dsname); break; } zvol_task_update_status(arg->task, 1, error == 0, error); return (0); } static void zvol_set_snapdev_impl(zvol_task_t *task) { const char *name = task->zt_name1; uint64_t snapdev = task->zt_value; zvol_snapdev_cb_arg_t arg = {task, snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } static void zvol_set_volmode_impl(zvol_task_t *task) { const char *name = task->zt_name1; uint64_t volmode = task->zt_value; fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; int error; if (strchr(name, '@') != NULL) return; /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). */ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); if (old_volmode == volmode) return; zvol_wait_close(zv); } cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: error = zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: error = zvol_remove_minor_impl(name); /* * The remove minor function call above, might be not * needed, if volmode was switched from 'none' value. * Ignore error in this case. 
*/ if (error == ENOENT) error = 0; else if (error) break; error = zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: error = zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ error = zvol_os_create_minor(name); break; } zvol_task_update_status(task, 1, error == 0, error); spl_fstrans_unmark(cookie); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *arg) { zvol_task_t *task = arg; switch (task->zt_op) { case ZVOL_ASYNC_CREATE_MINORS: zvol_create_minors_impl(task); break; case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task); break; case ZVOL_ASYNC_SET_VOLMODE: zvol_set_volmode_impl(task); break; default: VERIFY(0); break; } zvol_task_report_status(task); kmem_free(task, sizeof (zvol_task_t)); } typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; zfs_prop_t zsda_prop; } zvol_set_prop_int_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_common_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { zvol_set_prop_int_arg_t *zsda = arg; char dsname[ZFS_MAX_DATASET_NAME_LEN]; zvol_task_t *task; uint64_t prop; const char *prop_name = zfs_prop_to_name(zsda->zsda_prop); dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0) return (0); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); if (zsda->zsda_prop == ZFS_PROP_VOLMODE) { task->zt_op = ZVOL_ASYNC_SET_VOLMODE; } else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) { task->zt_op = ZVOL_ASYNC_SET_SNAPDEV; } else { kmem_free(task, sizeof (zvol_task_t)); return (0); } task->zt_value = prop; strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1)); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply the property appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "property" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. 
*/ static void zvol_set_common_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source, uint64_t val) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = val; zsda.zsda_prop = prop; return (dsl_sync_task(ddname, zvol_set_common_check, zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_create_minors(const char *name) { spa_t *spa; zvol_task_t *task; taskqid_t id; if (spa_open(name, &spa, FTAG) != 0) return; task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->zt_op = ZVOL_ASYNC_CREATE_MINORS; strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if (id != TASKQID_INVALID) taskq_wait_id(spa->spa_zvol_taskq, id); spa_close(spa, FTAG); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->zt_op = ZVOL_ASYNC_REMOVE_MINORS; strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); task->zt_value = B_TRUE; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->zt_op = ZVOL_ASYNC_RENAME_MINORS; strlcpy(task->zt_name1, name1, sizeof (task->zt_name1)); strlcpy(task->zt_name2, name2, sizeof (task->zt_name2)); id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } boolean_t zvol_is_zvol(const char *name) { return (zvol_os_is_zvol(name)); } int zvol_init_impl(void) { int i; /* * zvol_threads is the module param the user passes in. * * zvol_actual_threads is what we use internally, since the user can * pass zvol_thread = 0 to mean "use all the CPUs" (the default). */ static unsigned int zvol_actual_threads; if (zvol_threads == 0) { /* * See dde9380a1 for why 32 was chosen here. This should * probably be refined to be some multiple of the number * of CPUs. */ zvol_actual_threads = MAX(max_ncpus, 32); } else { zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } /* * Use at least 32 zvol_threads but for many core system, * prefer 6 threads per taskq, but no more taskqs * than threads in them on large systems. 
* * taskq total * cpus taskqs threads threads * ------- ------- ------- ------- * 1 1 32 32 * 2 1 32 32 * 4 1 32 32 * 8 2 16 32 * 16 3 11 33 * 32 5 7 35 * 64 8 8 64 * 128 11 12 132 * 256 16 16 256 */ zv_taskq_t *ztqs = &zvol_taskqs; int num_tqs = MIN(max_ncpus, zvol_num_taskqs); if (num_tqs == 0) { num_tqs = 1 + max_ncpus / 6; while (num_tqs * num_tqs > zvol_actual_threads) num_tqs--; } int per_tq_thread = zvol_actual_threads / num_tqs; if (per_tq_thread * num_tqs < zvol_actual_threads) per_tq_thread++; ztqs->tqs_cnt = num_tqs; ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); for (uint_t i = 0; i < num_tqs; i++) { char name[32]; (void) snprintf(name, sizeof (name), "%s_tq-%u", ZVOL_DRIVER, i); ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, maxclsyspri, per_tq_thread, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (ztqs->tqs_taskq[i] == NULL) { for (int j = i - 1; j >= 0; j--) taskq_destroy(ztqs->tqs_taskq[j]); kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); ztqs->tqs_taskq = NULL; return (SET_ERROR(ENOMEM)); } } list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); return (0); } void zvol_fini_impl(void) { zv_taskq_t *ztqs = &zvol_taskqs; zvol_remove_minors_impl(NULL); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); if (ztqs->tqs_taskq == NULL) { ASSERT0(ztqs->tqs_cnt); } else { for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); taskq_destroy(ztqs->tqs_taskq[i]); } kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); ztqs->tqs_taskq = NULL; } } ZFS_MODULE_PARAM(zfs_vol, zvol_, inhibit_dev, UINT, ZMOD_RW, "Do not create zvol device nodes"); ZFS_MODULE_PARAM(zfs_vol, zvol_, prefetch_bytes, UINT, ZMOD_RW, "Prefetch N bytes at zvol start+end"); ZFS_MODULE_PARAM(zfs_vol, zvol_vol, mode, UINT, ZMOD_RW, "Default volmode property value"); ZFS_MODULE_PARAM(zfs_vol, zvol_, threads, UINT, ZMOD_RW, "Number of threads for I/O requests. Set to 0 to use all active CPUs"); ZFS_MODULE_PARAM(zfs_vol, zvol_, num_taskqs, UINT, ZMOD_RW, "Number of zvol taskqs"); ZFS_MODULE_PARAM(zfs_vol, zvol_, request_sync, UINT, ZMOD_RW, "Synchronously handle bio requests"); diff --git a/sys/contrib/openzfs/rpm/generic/zfs.spec.in b/sys/contrib/openzfs/rpm/generic/zfs.spec.in index 1ce668e7b86d..edcfdd2d7136 100644 --- a/sys/contrib/openzfs/rpm/generic/zfs.spec.in +++ b/sys/contrib/openzfs/rpm/generic/zfs.spec.in @@ -1,604 +1,606 @@ %global _sbindir /sbin %global _libdir /%{_lib} # Set the default udev directory based on distribution. %if %{undefined _udevdir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _udevdir %{_prefix}/lib/udev %else %global _udevdir /lib/udev %endif %endif # Set the default udevrule directory based on distribution. %if %{undefined _udevruledir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _udevruledir %{_prefix}/lib/udev/rules.d %else %global _udevruledir /lib/udev/rules.d %endif %endif # Set the default _bashcompletiondir directory based on distribution. 
%if %{undefined _bashcompletiondir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _bashcompletiondir /etc/bash_completion.d %else %global _bashcompletiondir /usr/share/bash-completion %endif %endif # Set the default dracut directory based on distribution. %if %{undefined _dracutdir} %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %global _dracutdir %{_prefix}/lib/dracut %else %global _dracutdir %{_prefix}/share/dracut %endif %endif %if %{undefined _initconfdir} %global _initconfdir /etc/sysconfig %endif %if %{undefined _unitdir} %global _unitdir %{_prefix}/lib/systemd/system %endif %if %{undefined _presetdir} %global _presetdir %{_prefix}/lib/systemd/system-preset %endif %if %{undefined _modulesloaddir} %global _modulesloaddir %{_prefix}/lib/modules-load.d %endif %if %{undefined _systemdgeneratordir} %global _systemdgeneratordir %{_prefix}/lib/systemd/system-generators %endif %if %{undefined _pkgconfigdir} %global _pkgconfigdir %{_prefix}/%{_lib}/pkgconfig %endif %bcond_with debug %bcond_with debuginfo %bcond_with asan %bcond_with ubsan %bcond_with systemd %bcond_with pam %bcond_without pyzfs # Generic enable switch for systemd %if %{with systemd} %define _systemd 1 %endif # Distros below support systemd %if 0%{?rhel}%{?fedora}%{?centos}%{?suse_version}%{?openEuler} %define _systemd 1 %endif # When not specified default to distribution provided version. %if %{undefined __use_python} %define __python /usr/bin/python3 %define __python_pkg_version 3 %else %define __python %{__use_python} %define __python_pkg_version %{__use_python_pkg_version} %endif %define __python_sitelib %(%{__python} -Esc " import sysconfig; if hasattr(sysconfig, 'get_default_scheme'): scheme = sysconfig.get_default_scheme() else: scheme = sysconfig._get_default_scheme() if scheme == 'posix_local': scheme = 'posix_prefix' prefix = '%{_prefix}' if prefix == 'NONE': prefix = '%{ac_default_prefix}' sitedir = sysconfig.get_path('purelib', scheme, vars={'base': prefix}) print(sitedir);" 2>/dev/null || %{__python} -Esc "from distutils import sysconfig; print(sysconfig.get_python_lib(0,0))") Name: @PACKAGE@ Version: @VERSION@ Release: @RELEASE@%{?dist} Summary: Commands to control the kernel modules and libraries Group: System Environment/Kernel License: @ZFS_META_LICENSE@ URL: https://github.com/openzfs/zfs Source0: %{name}-%{version}.tar.gz BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) Requires: libzpool6%{?_isa} = %{version}-%{release} Requires: libnvpair3%{?_isa} = %{version}-%{release} Requires: libuutil3%{?_isa} = %{version}-%{release} Requires: libzfs6%{?_isa} = %{version}-%{release} Requires: %{name}-kmod = %{version} Provides: %{name}-kmod-common = %{version}-%{release} Obsoletes: spl <= %{version} # zfs-fuse provides the same commands and man pages that OpenZFS does. # Renaming those on either side would conflict with all available documentation. 
Conflicts: zfs-fuse %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler} BuildRequires: gcc, make BuildRequires: zlib-devel BuildRequires: libuuid-devel BuildRequires: libblkid-devel BuildRequires: libudev-devel BuildRequires: libattr-devel BuildRequires: openssl-devel %if 0%{?fedora}%{?suse_version}%{?openEuler} || 0%{?rhel} >= 8 || 0%{?centos} >= 8 BuildRequires: libtirpc-devel %endif %if (0%{?fedora}%{?suse_version}%{?openEuler}) || (0%{?rhel} && 0%{?rhel} < 9) # We don't directly use it, but if this isn't installed, rpmbuild as root can # crash+corrupt rpmdb # See issue #12071 BuildRequires: ncompress %endif Requires: openssl %if 0%{?_systemd} BuildRequires: systemd %endif %endif %if 0%{?_systemd} Requires(post): systemd Requires(preun): systemd Requires(postun): systemd %endif # The zpool iostat/status -c scripts call some utilities like lsblk and iostat Requires: util-linux Requires: sysstat %description This package contains the core ZFS command line utilities. %package -n libzpool6 Summary: Native ZFS pool library for Linux Group: System Environment/Kernel Obsoletes: libzpool2 <= %{version} Obsoletes: libzpool4 <= %{version} Obsoletes: libzpool5 <= %{version} %description -n libzpool6 This package contains the zpool library, which provides support for managing zpools %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets -n libzpool6 %else %post -n libzpool6 -p /sbin/ldconfig %postun -n libzpool6 -p /sbin/ldconfig %endif %package -n libnvpair3 Summary: Solaris name-value library for Linux Group: System Environment/Kernel Obsoletes: libnvpair1 <= %{version} %description -n libnvpair3 This package contains routines for packing and unpacking name-value pairs. This functionality is used to portably transport data across process boundaries, between kernel and user space, and can be used to write self describing data structures on disk. %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets -n libnvpair3 %else %post -n libnvpair3 -p /sbin/ldconfig %postun -n libnvpair3 -p /sbin/ldconfig %endif %package -n libuutil3 Summary: Solaris userland utility library for Linux Group: System Environment/Kernel Obsoletes: libuutil1 <= %{version} %description -n libuutil3 This library provides a variety of compatibility functions for OpenZFS: * libspl: The Solaris Porting Layer userland library, which provides APIs that make it possible to run Solaris user code in a Linux environment with relatively minimal modification. * libavl: The Adelson-Velskii Landis balanced binary tree manipulation library. * libefi: The Extensible Firmware Interface library for GUID disk partitioning. * libshare: NFS, SMB, and iSCSI service integration for ZFS. %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets -n libuutil3 %else %post -n libuutil3 -p /sbin/ldconfig %postun -n libuutil3 -p /sbin/ldconfig %endif # The library version is encoded in the package name. When updating the # version information it is important to add an obsoletes line below for # the previous version of the package. 
%package -n libzfs6 Summary: Native ZFS filesystem library for Linux Group: System Environment/Kernel Obsoletes: libzfs2 <= %{version} Obsoletes: libzfs4 <= %{version} Obsoletes: libzfs5 <= %{version} %description -n libzfs6 This package provides support for managing ZFS filesystems %if %{defined ldconfig_scriptlets} %ldconfig_scriptlets -n libzfs6 %else %post -n libzfs6 -p /sbin/ldconfig %postun -n libzfs6 -p /sbin/ldconfig %endif %package -n libzfs6-devel Summary: Development headers Group: System Environment/Kernel Requires: libzfs6%{?_isa} = %{version}-%{release} Requires: libzpool6%{?_isa} = %{version}-%{release} Requires: libnvpair3%{?_isa} = %{version}-%{release} Requires: libuutil3%{?_isa} = %{version}-%{release} Provides: libzpool6-devel = %{version}-%{release} Provides: libnvpair3-devel = %{version}-%{release} Provides: libuutil3-devel = %{version}-%{release} Obsoletes: zfs-devel <= %{version} Obsoletes: libzfs2-devel <= %{version} Obsoletes: libzfs4-devel <= %{version} Obsoletes: libzfs5-devel <= %{version} %description -n libzfs6-devel This package contains the header files needed for building additional applications against the ZFS libraries. %package test Summary: Test infrastructure Group: System Environment/Kernel Requires: %{name}%{?_isa} = %{version}-%{release} Requires: parted Requires: lsscsi Requires: mdadm Requires: bc Requires: ksh Requires: fio Requires: acl Requires: sudo Requires: sysstat Requires: libaio Requires: python%{__python_pkg_version} %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler} BuildRequires: libaio-devel %endif AutoReqProv: no %description test This package contains test infrastructure and support scripts for validating the file system. %package dracut Summary: Dracut module Group: System Environment/Kernel BuildArch: noarch Requires: %{name} >= %{version} Requires: dracut Requires: /usr/bin/awk Requires: grep %description dracut This package contains a dracut module used to construct an initramfs image which is ZFS aware. %if %{with pyzfs} # Enforce `python36-` package prefix for CentOS 7 # since dependencies come from EPEL and are named this way %package -n python%{__python_pkg_version}-pyzfs Summary: Python %{python_version} wrapper for libzfs_core Group: Development/Languages/Python License: Apache-2.0 BuildArch: noarch Requires: libzfs6 = %{version}-%{release} Requires: libnvpair3 = %{version}-%{release} Requires: libffi Requires: python%{__python_pkg_version} %if 0%{?centos} == 7 Requires: python36-cffi %else Requires: python%{__python_pkg_version}-cffi %endif %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler} %if 0%{?centos} == 7 BuildRequires: python36-packaging BuildRequires: python36-devel BuildRequires: python36-cffi BuildRequires: python36-setuptools %else BuildRequires: python%{__python_pkg_version}-packaging BuildRequires: python%{__python_pkg_version}-devel BuildRequires: python%{__python_pkg_version}-cffi BuildRequires: python%{__python_pkg_version}-setuptools %endif BuildRequires: libffi-devel %endif %description -n python%{__python_pkg_version}-pyzfs This package provides a python wrapper for the libzfs_core C library. %endif %if 0%{?_initramfs} %package initramfs Summary: Initramfs module Group: System Environment/Kernel Requires: %{name}%{?_isa} = %{version}-%{release} Requires: initramfs-tools %description initramfs This package contains a initramfs module used to construct an initramfs image which is ZFS aware. 
%endif %if %{with pam} %package -n pam_zfs_key Summary: PAM module for encrypted ZFS datasets %if 0%{?rhel}%{?centos}%{?fedora}%{?suse_version}%{?openEuler} BuildRequires: pam-devel %endif %description -n pam_zfs_key This package contains the pam_zfs_key PAM module, which provides support for unlocking datasets on user login. %endif %prep %if %{with debug} %define debug --enable-debug %else %define debug --disable-debug %endif %if %{with debuginfo} %define debuginfo --enable-debuginfo %else %define debuginfo --disable-debuginfo %endif %if %{with asan} %define asan --enable-asan %else %define asan --disable-asan %endif %if %{with ubsan} %define ubsan --enable-ubsan %else %define ubsan --disable-ubsan %endif %if 0%{?_systemd} %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-mount@.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target %else %define systemd --enable-sysvinit --disable-systemd %endif %if %{with pyzfs} %define pyzfs --enable-pyzfs %else %define pyzfs --disable-pyzfs %endif %if %{with pam} %define pam --enable-pam %else %define pam --disable-pam %endif %setup -q %build %configure \ --with-config=user \ --with-udevdir=%{_udevdir} \ --with-udevruledir=%{_udevruledir} \ --with-dracutdir=%{_dracutdir} \ --with-pamconfigsdir=%{_datadir}/pam-configs \ --with-pammoduledir=%{_libdir}/security \ --with-python=%{__python} \ --with-pkgconfigdir=%{_pkgconfigdir} \ --disable-static \ %{debug} \ %{debuginfo} \ %{asan} \ %{ubsan} \ %{systemd} \ %{pam} \ %{pyzfs} make %{?_smp_mflags} %install %{__rm} -rf $RPM_BUILD_ROOT make install DESTDIR=%{?buildroot} find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; %if 0%{!?__brp_mangle_shebangs:1} find %{?buildroot}%{_bindir} \ \( -name arc_summary -or -name arcstat -or -name dbufstat \ -or -name zilstat \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; find %{?buildroot}%{_datadir} \ \( -name test-runner.py -or -name zts-report.py \) \ -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; %endif %post %if 0%{?_systemd} %if 0%{?systemd_post:1} %systemd_post %{systemd_svcs} %else if [ "$1" = "1" -o "$1" = "install" ] ; then # Initial installation systemctl preset %{systemd_svcs} >/dev/null || true fi %endif %else if [ -x /sbin/chkconfig ]; then /sbin/chkconfig --add zfs-import /sbin/chkconfig --add zfs-load-key /sbin/chkconfig --add zfs-mount /sbin/chkconfig --add zfs-share /sbin/chkconfig --add zfs-zed fi %endif exit 0 # On RHEL/CentOS 7 the static nodes aren't refreshed by default after # installing a package. This is the default behavior for Fedora. 
%posttrans %if 0%{?rhel} == 7 || 0%{?centos} == 7 systemctl restart kmod-static-nodes systemctl restart systemd-tmpfiles-setup-dev udevadm trigger %endif %preun %if 0%{?_systemd} %if 0%{?systemd_preun:1} %systemd_preun %{systemd_svcs} %else if [ "$1" = "0" -o "$1" = "remove" ] ; then # Package removal, not upgrade systemctl --no-reload disable %{systemd_svcs} >/dev/null || true systemctl stop %{systemd_svcs} >/dev/null || true fi %endif %else if [ "$1" = "0" -o "$1" = "remove" ] && [ -x /sbin/chkconfig ]; then /sbin/chkconfig --del zfs-import /sbin/chkconfig --del zfs-load-key /sbin/chkconfig --del zfs-mount /sbin/chkconfig --del zfs-share /sbin/chkconfig --del zfs-zed fi %endif exit 0 %postun %if 0%{?_systemd} %if 0%{?systemd_postun:1} %systemd_postun %{systemd_svcs} %else systemctl --system daemon-reload >/dev/null || true %endif %endif %files # Core utilities %{_sbindir}/* %{_bindir}/raidz_test %{_bindir}/zvol_wait # Optional Python 3 scripts %{_bindir}/arc_summary +%{_bindir}/zarcsummary %{_bindir}/arcstat +%{_bindir}/zarcstat %{_bindir}/dbufstat %{_bindir}/zilstat # Man pages %{_mandir}/man1/* %{_mandir}/man4/* %{_mandir}/man5/* %{_mandir}/man7/* %{_mandir}/man8/* # Configuration files and scripts %{_libexecdir}/%{name} %{_udevdir}/vdev_id %{_udevdir}/zvol_id %{_udevdir}/rules.d/* %{_datadir}/%{name}/compatibility.d %if ! 0%{?_systemd} || 0%{?_initramfs} # Files needed for sysvinit and initramfs-tools %{_sysconfdir}/%{name}/zfs-functions %config(noreplace) %{_initconfdir}/zfs %else %exclude %{_sysconfdir}/%{name}/zfs-functions %exclude %{_initconfdir}/zfs %endif %if 0%{?_systemd} %{_unitdir}/* %{_presetdir}/* %{_modulesloaddir}/* %{_systemdgeneratordir}/* %else %config(noreplace) %{_sysconfdir}/init.d/* %endif %config(noreplace) %{_sysconfdir}/%{name}/zed.d/* %config(noreplace) %{_sysconfdir}/%{name}/zpool.d/* %config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example %attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/* %config(noreplace) %{_bashcompletiondir}/zfs %config(noreplace) %{_bashcompletiondir}/zpool %files -n libzpool6 %{_libdir}/libzpool.so.* %files -n libnvpair3 %{_libdir}/libnvpair.so.* %files -n libuutil3 %{_libdir}/libuutil.so.* %files -n libzfs6 %{_libdir}/libzfs*.so.* %files -n libzfs6-devel %{_pkgconfigdir}/libzfs.pc %{_pkgconfigdir}/libzfsbootenv.pc %{_pkgconfigdir}/libzfs_core.pc %{_libdir}/*.so %{_includedir}/* %doc AUTHORS COPYRIGHT LICENSE NOTICE README.md %files test %{_datadir}/%{name}/zfs-tests %{_datadir}/%{name}/test-runner %{_datadir}/%{name}/runfiles %{_datadir}/%{name}/*.sh %files dracut %doc contrib/dracut/README.md %{_dracutdir}/modules.d/* %if %{with pyzfs} %files -n python%{__python_pkg_version}-pyzfs %doc contrib/pyzfs/README %doc contrib/pyzfs/LICENSE %defattr(-,root,root,-) %{__python_sitelib}/libzfs_core/* %{__python_sitelib}/pyzfs* %endif %if 0%{?_initramfs} %files initramfs %doc contrib/initramfs/README.md /usr/share/initramfs-tools/* %else # Since we're not building the initramfs package, # ignore those files. 
%exclude /usr/share/initramfs-tools %endif %if %{with pam} %files -n pam_zfs_key %{_libdir}/security/* %{_datadir}/pam-configs/* %endif diff --git a/sys/contrib/openzfs/tests/runfiles/linux.run b/sys/contrib/openzfs/tests/runfiles/linux.run index f3d56acffde0..ba367fad402b 100644 --- a/sys/contrib/openzfs/tests/runfiles/linux.run +++ b/sys/contrib/openzfs/tests/runfiles/linux.run @@ -1,239 +1,239 @@ # SPDX-License-Identifier: CDDL-1.0 # # This file and its contents are supplied under the terms of the # Common Development and Distribution License ("CDDL"), version 1.0. # You may only use this file in accordance with the terms of version # 1.0 of the CDDL. # # A full copy of the text of the CDDL should have accompanied this # source. A copy of the CDDL is also available via the Internet at # http://www.illumos.org/license/CDDL. # [DEFAULT] pre = setup quiet = False pre_user = root user = root timeout = 600 post_user = root post = cleanup failsafe_user = root failsafe = callbacks/zfs_failsafe tags = ['functional'] [tests/functional/acl/posix:Linux] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix'] [tests/functional/acl/posix-sa:Linux] tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos', 'posix_004_pos'] tags = ['functional', 'acl', 'posix-sa'] [tests/functional/atime:Linux] tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] tests = ['block_cloning_ficlone', 'block_cloning_ficlonerange', 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] tests = ['chattr_001_pos', 'chattr_002_neg'] tags = ['functional', 'chattr'] [tests/functional/cli_root/zfs:Linux] tests = ['zfs_003_neg'] tags = ['functional', 'cli_root', 'zfs'] [tests/functional/cli_root/zfs_mount:Linux] tests = ['zfs_mount_006_pos', 'zfs_mount_008_pos', 'zfs_mount_013_pos', 'zfs_mount_014_neg', 'zfs_multi_mount'] tags = ['functional', 'cli_root', 'zfs_mount'] [tests/functional/cli_root/zfs_share:Linux] tests = ['zfs_share_005_pos', 'zfs_share_007_neg', 'zfs_share_009_neg', 'zfs_share_012_pos', 'zfs_share_013_pos'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_unshare:Linux] tests = ['zfs_unshare_008_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_sysfs:Linux] tests = ['zfeature_set_unsupported', 'zfs_get_unsupported', 'zfs_set_unsupported', 'zfs_sysfs_live', 'zpool_get_unsupported', 'zpool_set_unsupported'] tags = ['functional', 'cli_root', 'zfs_sysfs'] [tests/functional/cli_root/zpool_add:Linux] tests = ['add_nested_replacing_spare'] tags = ['functional', 'cli_root', 'zpool_add'] [tests/functional/cli_root/zpool_expand:Linux] tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] tags = ['functional', 'cli_root', 'zpool_expand'] [tests/functional/cli_root/zpool_import:Linux] tests = ['zpool_import_hostid_changed', 'zpool_import_hostid_changed_unclean_export', 'zpool_import_hostid_changed_cachefile', 'zpool_import_hostid_changed_cachefile_unclean_export'] tags = ['functional', 'cli_root', 'zpool_import'] [tests/functional/cli_root/zpool_reopen:Linux] tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', 'zpool_reopen_006_neg', 'zpool_reopen_007_pos'] tags = 
['functional', 'cli_root', 'zpool_reopen'] [tests/functional/cli_root/zpool_split:Linux] tests = ['zpool_split_wholedisk'] tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/compression:Linux] tests = ['compress_004_pos'] tags = ['functional', 'compression'] [tests/functional/devices:Linux] tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] tags = ['functional', 'devices'] [tests/functional/direct:Linux] tests = ['dio_loopback_dev', 'dio_write_verify'] tags = ['functional', 'direct'] [tests/functional/events:Linux] tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter', 'zed_fd_spill', 'zed_cksum_reported', 'zed_cksum_config', 'zed_io_config', 'zed_slow_io', 'zed_slow_io_many_vdevs', 'zed_diagnose_multiple'] tags = ['functional', 'events'] [tests/functional/fallocate:Linux] tests = ['fallocate_prealloc', 'fallocate_zero-range'] tags = ['functional', 'fallocate'] [tests/functional/fault:Linux] tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos', 'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_double', 'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', 'decompress_fault', 'fault_limits', 'scrub_after_resilver', 'suspend_on_probe_errors', 'suspend_resume_single', 'zpool_status_-s'] tags = ['functional', 'fault'] [tests/functional/features/large_dnode:Linux] tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos'] tags = ['functional', 'features', 'large_dnode'] [tests/functional/io:Linux] tests = ['libaio', 'io_uring'] tags = ['functional', 'io'] [tests/functional/largest_pool:Linux] tests = ['largest_pool_001_pos'] pre = post = tags = ['functional', 'largest_pool'] [tests/functional/longname:Linux] tests = ['longname_001_pos', 'longname_002_pos', 'longname_003_pos'] tags = ['functional', 'longname'] [tests/functional/luks:Linux] pre = post = tests = ['luks_sanity'] tags = ['functional', 'luks'] [tests/functional/mmap:Linux] tests = ['mmap_libaio_001_pos', 'mmap_sync_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mmp:Linux] tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid', 'mmp_write_slow_disk'] tags = ['functional', 'mmp'] [tests/functional/mount:Linux] -tests = ['umount_unlinked_drain'] +tests = ['umount_unlinked_drain', 'mount_loopback'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] tests = ['pam_basic', 'pam_change_unmounted', 'pam_mount_recursively', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] tests = ['procfs_list_basic', 'procfs_list_concurrent_readers', 'procfs_list_stale_read', 'pool_state'] tags = ['functional', 'procfs'] [tests/functional/projectquota:Linux] tests = ['defaultprojectquota_001_pos', 'defaultprojectquota_005_pos', 'projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', 'projectquota_001_pos', 'projectquota_003_pos', 'projectquota_006_pos', 'projectspace_001_pos', 'projectspace_002_pos', 'projectspace_003_pos', 'projectspace_004_pos', 'projectspace_005_pos', 'projecttree_001_pos'] tags = ['functional', 'projectquota'] [tests/functional/dos_attributes:Linux] tests = ['read_dos_attrs_001', 'write_dos_attrs_001'] tags = ['functional', 'dos_attributes'] [tests/functional/renameat2:Linux] 
tests = ['renameat2_noreplace', 'renameat2_exchange', 'renameat2_whiteout'] tags = ['functional', 'renameat2'] [tests/functional/rsend:Linux] tests = ['send_realloc_dnode_size', 'send_encrypted_files', 'send-c_longname'] tags = ['functional', 'rsend'] [tests/functional/simd:Linux] pre = post = tests = ['simd_supported'] tags = ['functional', 'simd'] [tests/functional/snapshot:Linux] tests = ['snapshot_015_pos', 'snapshot_016_pos'] tags = ['functional', 'snapshot'] [tests/functional/syncfs:Linux] tests = ['syncfs_suspend'] tags = ['functional', 'syncfs'] pre = post = [tests/functional/tmpfile:Linux] tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos', 'tmpfile_stat_mode'] tags = ['functional', 'tmpfile'] [tests/functional/upgrade:Linux] tests = ['upgrade_projectquota_001_pos', 'upgrade_projectquota_002_pos'] tags = ['functional', 'upgrade'] [tests/functional/user_namespace:Linux] tests = ['user_namespace_001', 'user_namespace_002', 'user_namespace_003', 'user_namespace_004'] tags = ['functional', 'user_namespace'] [tests/functional/userquota:Linux] tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos', 'groupspace_004_pos','userquota_013_pos', 'userspace_003_pos'] tags = ['functional', 'userquota'] [tests/functional/zvol/zvol_misc:Linux] tests = ['zvol_misc_fua'] tags = ['functional', 'zvol', 'zvol_misc'] [tests/functional/idmap_mount:Linux] tests = ['idmap_mount_001', 'idmap_mount_002', 'idmap_mount_003', 'idmap_mount_004', 'idmap_mount_005'] tags = ['functional', 'idmap_mount'] diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg index 884a99d785bc..580281b30d7e 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/commands.cfg @@ -1,241 +1,243 @@ # # Copyright (c) 2016, 2019 by Delphix. All rights reserved. # These variables are used by zfs-tests.sh to constrain which utilities # may be used by the suite. The suite will create a directory which is # the only element of $PATH and create symlinks from that dir to the # binaries listed below. # # Please keep the contents of each variable sorted for ease of reading # and maintenance. 
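# A minimal sketch of the constraint described above (hypothetical directory
# and loop, not the actual zfs-tests.sh code): the suite builds a private bin
# directory, links in only the tools named in the variables listed below, and
# makes that directory the sole element of PATH, so anything outside the
# whitelist fails to resolve.
#
sandbox_bin=$(mktemp -d)
for cmd in $SYSTEM_FILES_COMMON $ZFS_FILES; do
        src=$(command -v "$cmd") || continue    # skip tools missing on this host
        ln -s "$src" "$sandbox_bin/$cmd"        # expose only whitelisted binaries
done
export PATH="$sandbox_bin"                      # the only PATH element during the run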
# export SYSTEM_FILES_COMMON='awk basename bc bunzip2 bzcat cat chgrp chmod chown cksum cmp cp cpio cut date dd df diff dirname dmesg du echo env expr false file find fio getconf getent getfacl grep gunzip gzip head hostname id iostat jq kill ksh ldd ln ls mkdir mknod mkfifo mktemp mount mv net od openssl pamtester pax pgrep ping pkill printf ps python3 readlink rm rmdir rsync scp script sed seq setfacl sh sleep sort ssh stat strings sudo swapoff swapon sync tail tar timeout touch tr true truncate umount uname uniq vmstat wc + which xargs xxh128sum' export SYSTEM_FILES_FREEBSD='chflags compress diskinfo fsck getextattr gpart jail jexec jls lsextattr mdconfig newfs pw rmextattr setextattr showmount swapctl sysctl trim uncompress' export SYSTEM_FILES_LINUX='attr blkid blkdiscard blockdev chattr cryptsetup exportfs fallocate flock free getfattr groupadd groupdel groupmod hostid logger losetup lsattr lsblk lscpu lsmod lsscsi + mkfs.xfs mkswap modprobe mountpoint mpstat nsenter parted perf setfattr setpriv udevadm unshare useradd userdel usermod wipefs' export ZFS_FILES='zdb zfs zhack zinject zpool ztest raidz_test arc_summary arcstat zilstat dbufstat mount.zfs zed zgenhostid zstream zfs_ids_to_path zpool_influxdb' export ZFSTEST_FILES='badsend btree_test chg_usr_exec clonefile clone_mmap_cached clone_mmap_write crypto_test devname2devid dir_rd_update draid file_fadvise file_append file_check file_trunc file_write get_diff getversion largest_file libzfs_input_check manipulate_user_buffer mkbusy mkfile mkfiles mktree mmap_exec mmap_ftruncate mmap_libaio mmap_seek mmap_sync mmapwrite mmap_write_sync nvlist_to_lua randfree_file randwritecomp readmmap read_dos_attributes renameat2 rename_dir rm_lnkcnt_zero_file send_doall statx threadsappend user_ns_exec write_dos_attributes xattrtest stride_dd zed_fd_spill-zedlet suid_write_to_file cp_files blake3_test edonr_test skein_test sha2_test ctime truncate_test ereports zfs_diff-socket dosmode_readonly_write idmap_util' diff --git a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg index e273c9f85c28..f2d7ceac0cbb 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg +++ b/sys/contrib/openzfs/tests/zfs-tests/include/tunables.cfg @@ -1,120 +1,120 @@ # This file exports variables for each tunable used in the test suite. # # Different platforms use different names for most tunables. To avoid littering # the tests with conditional logic for deciding how to set each tunable, the # logic is instead consolidated to this one file. # # Any use of tunables in tests must use a name defined here. New entries # should be added to the table as needed. Please keep the table sorted # alphabetically for ease of maintenance. # # Platform-specific tunables should still use a NAME from this table for # consistency. Enter UNSUPPORTED in the column for platforms on which the # tunable is not implemented. 
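# A usage sketch of the mapping that the table and export loop below set up
# (illustrative shell only; in the suite the dereference is wrapped by helpers
# such as set_tunable64/get_tunable). After this file is sourced, each NAME
# holds the platform-specific tunable, e.g. ${TXG_TIMEOUT} expands to
# zfs_txg_timeout on Linux and to txg.timeout on FreeBSD, so a test never
# hard-codes the per-platform name:
#
case "$(uname)" in
Linux)   echo 5 > "/sys/module/zfs/parameters/${TXG_TIMEOUT}" ;;
FreeBSD) sysctl "vfs.zfs.${TXG_TIMEOUT}=5" ;;
esac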
UNAME=$(uname) # NAME FreeBSD tunable Linux tunable cat <<%%%% | ADMIN_SNAPSHOT UNSUPPORTED zfs_admin_snapshot ALLOW_REDACTED_DATASET_MOUNT allow_redacted_dataset_mount zfs_allow_redacted_dataset_mount ARC_MAX arc.max zfs_arc_max ARC_MIN arc.min zfs_arc_min ASYNC_BLOCK_MAX_BLOCKS async_block_max_blocks zfs_async_block_max_blocks CHECKSUM_EVENTS_PER_SECOND checksum_events_per_second zfs_checksum_events_per_second COMMIT_TIMEOUT_PCT commit_timeout_pct zfs_commit_timeout_pct COMPRESSED_ARC_ENABLED compressed_arc_enabled zfs_compressed_arc_enabled CONDENSE_INDIRECT_COMMIT_ENTRY_DELAY_MS condense.indirect_commit_entry_delay_ms zfs_condense_indirect_commit_entry_delay_ms CONDENSE_INDIRECT_OBSOLETE_PCT condense.indirect_obsolete_pct zfs_condense_indirect_obsolete_pct CONDENSE_MIN_MAPPING_BYTES condense.min_mapping_bytes zfs_condense_min_mapping_bytes DBUF_CACHE_SHIFT dbuf.cache_shift dbuf_cache_shift DDT_ZAP_DEFAULT_BS dedup.ddt_zap_default_bs ddt_zap_default_bs DDT_ZAP_DEFAULT_IBS dedup.ddt_zap_default_ibs ddt_zap_default_ibs DDT_DATA_IS_SPECIAL ddt_data_is_special zfs_ddt_data_is_special DEDUP_LOG_TXG_MAX dedup.log_txg_max zfs_dedup_log_txg_max DEDUP_LOG_FLUSH_ENTRIES_MAX dedup.log_flush_entries_max zfs_dedup_log_flush_entries_max DEDUP_LOG_FLUSH_ENTRIES_MIN dedup.log_flush_entries_min zfs_dedup_log_flush_entries_min DEADMAN_CHECKTIME_MS deadman.checktime_ms zfs_deadman_checktime_ms DEADMAN_EVENTS_PER_SECOND deadman_events_per_second zfs_deadman_events_per_second DEADMAN_FAILMODE deadman.failmode zfs_deadman_failmode DEADMAN_SYNCTIME_MS deadman.synctime_ms zfs_deadman_synctime_ms DEADMAN_ZIOTIME_MS deadman.ziotime_ms zfs_deadman_ziotime_ms DISABLE_IVSET_GUID_CHECK disable_ivset_guid_check zfs_disable_ivset_guid_check DMU_OFFSET_NEXT_SYNC dmu_offset_next_sync zfs_dmu_offset_next_sync EMBEDDED_SLOG_MIN_MS embedded_slog_min_ms zfs_embedded_slog_min_ms INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export LUA_MAX_MEMLIMIT lua.max_memlimit zfs_lua_max_memlimit L2ARC_MFUONLY l2arc.mfuonly l2arc_mfuonly L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled L2ARC_TRIM_AHEAD l2arc.trim_ahead l2arc_trim_ahead L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc LIVELIST_CONDENSE_SYNC_CANCEL livelist.condense.sync_cancel zfs_livelist_condense_sync_cancel LIVELIST_CONDENSE_SYNC_PAUSE livelist.condense.sync_pause zfs_livelist_condense_sync_pause LIVELIST_CONDENSE_ZTHR_CANCEL livelist.condense.zthr_cancel zfs_livelist_condense_zthr_cancel LIVELIST_CONDENSE_ZTHR_PAUSE livelist.condense.zthr_pause zfs_livelist_condense_zthr_pause LIVELIST_MAX_ENTRIES livelist.max_entries zfs_livelist_max_entries LIVELIST_MIN_PERCENT_SHARED livelist.min_percent_shared zfs_livelist_min_percent_shared MAX_DATASET_NESTING max_dataset_nesting zfs_max_dataset_nesting MAX_MISSING_TVDS max_missing_tvds zfs_max_missing_tvds METASLAB_DEBUG_LOAD metaslab.debug_load metaslab_debug_load METASLAB_FORCE_GANGING metaslab.force_ganging metaslab_force_ganging METASLAB_FORCE_GANGING_PCT metaslab.force_ganging_pct metaslab_force_ganging_pct MULTIHOST_FAIL_INTERVALS multihost.fail_intervals 
zfs_multihost_fail_intervals MULTIHOST_HISTORY multihost.history zfs_multihost_history MULTIHOST_IMPORT_INTERVALS multihost.import_intervals zfs_multihost_import_intervals MULTIHOST_INTERVAL multihost.interval zfs_multihost_interval OVERRIDE_ESTIMATE_RECORDSIZE send.override_estimate_recordsize zfs_override_estimate_recordsize PREFETCH_DISABLE prefetch.disable zfs_prefetch_disable RAIDZ_EXPAND_MAX_REFLOW_BYTES vdev.expand_max_reflow_bytes raidz_expand_max_reflow_bytes REBUILD_SCRUB_ENABLED rebuild_scrub_enabled zfs_rebuild_scrub_enabled -REMOVAL_SUSPEND_PROGRESS removal_suspend_progress zfs_removal_suspend_progress -REMOVE_MAX_SEGMENT remove_max_segment zfs_remove_max_segment +REMOVAL_SUSPEND_PROGRESS vdev.removal_suspend_progress zfs_removal_suspend_progress +REMOVE_MAX_SEGMENT vdev.remove_max_segment zfs_remove_max_segment RESILVER_MIN_TIME_MS resilver_min_time_ms zfs_resilver_min_time_ms RESILVER_DEFER_PERCENT resilver_defer_percent zfs_resilver_defer_percent SCAN_LEGACY scan_legacy zfs_scan_legacy SCAN_SUSPEND_PROGRESS scan_suspend_progress zfs_scan_suspend_progress SCAN_VDEV_LIMIT scan_vdev_limit zfs_scan_vdev_limit SCRUB_AFTER_EXPAND scrub_after_expand zfs_scrub_after_expand SEND_HOLES_WITHOUT_BIRTH_TIME send_holes_without_birth_time send_holes_without_birth_time SLOW_IO_EVENTS_PER_SECOND slow_io_events_per_second zfs_slow_io_events_per_second SPA_ASIZE_INFLATION spa.asize_inflation spa_asize_inflation SPA_DISCARD_MEMORY_LIMIT spa.discard_memory_limit zfs_spa_discard_memory_limit SPA_LOAD_VERIFY_DATA spa.load_verify_data spa_load_verify_data SPA_LOAD_VERIFY_METADATA spa.load_verify_metadata spa_load_verify_metadata SPA_NOTE_TXG_TIME spa.note_txg_time spa_note_txg_time TRIM_EXTENT_BYTES_MIN trim.extent_bytes_min zfs_trim_extent_bytes_min TRIM_METASLAB_SKIP trim.metaslab_skip zfs_trim_metaslab_skip TRIM_TXG_BATCH trim.txg_batch zfs_trim_txg_batch TXG_HISTORY txg.history zfs_txg_history TXG_TIMEOUT txg.timeout zfs_txg_timeout UNLINK_SUSPEND_PROGRESS UNSUPPORTED zfs_unlink_suspend_progress VDEV_FILE_LOGICAL_ASHIFT vdev.file.logical_ashift vdev_file_logical_ashift VDEV_FILE_PHYSICAL_ASHIFT vdev.file.physical_ashift vdev_file_physical_ashift VDEV_MAX_AUTO_ASHIFT vdev.max_auto_ashift zfs_vdev_max_auto_ashift VDEV_MIN_MS_COUNT vdev.min_ms_count zfs_vdev_min_ms_count VDEV_DIRECT_WR_VERIFY vdev.direct_write_verify zfs_vdev_direct_write_verify VDEV_VALIDATE_SKIP vdev.validate_skip vdev_validate_skip VOL_INHIBIT_DEV vol.inhibit_dev zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_REQUEST_SYNC vol.request_sync zvol_request_sync VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq BCLONE_ENABLED bclone_enabled zfs_bclone_enabled BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty DIO_ENABLED dio_enabled zfs_dio_enabled DIO_STRICT dio_strict zfs_dio_strict XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max ZIO_SLOW_IO_MS zio.slow_io_ms zio_slow_io_ms ZIL_SAXATTR zil_saxattr zfs_zil_saxattr %%%% while read name FreeBSD Linux; do eval "export ${name}=\$${UNAME}" done diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am index 41e7b45ef4ec..94db292c9518 100644 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/Makefile.am @@ -1,2262 +1,2263 @@ CLEANFILES = dist_noinst_DATA = include $(top_srcdir)/config/Substfiles.am 
datadir_zfs_tests_testsdir = $(datadir)/$(PACKAGE)/zfs-tests/tests nobase_dist_datadir_zfs_tests_tests_DATA = \ perf/nfs-sample.cfg \ perf/perf.shlib \ \ perf/fio/mkfiles.fio \ perf/fio/random_reads.fio \ perf/fio/random_readwrite.fio \ perf/fio/random_readwrite_fixed.fio \ perf/fio/random_writes.fio \ perf/fio/sequential_reads.fio \ perf/fio/sequential_readwrite.fio \ perf/fio/sequential_writes.fio nobase_dist_datadir_zfs_tests_tests_SCRIPTS = \ perf/regression/random_reads.ksh \ perf/regression/random_readwrite.ksh \ perf/regression/random_readwrite_fixed.ksh \ perf/regression/random_writes.ksh \ perf/regression/random_writes_zil.ksh \ perf/regression/sequential_reads_arc_cached_clone.ksh \ perf/regression/sequential_reads_arc_cached.ksh \ perf/regression/sequential_reads_dbuf_cached.ksh \ perf/regression/sequential_reads.ksh \ perf/regression/sequential_writes.ksh \ perf/regression/setup.ksh \ \ perf/scripts/prefetch_io.sh # These lists can be regenerated by running make regen-tests at the root, or, on a *clean* source: # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable -name '*.in' | sort | sed 's/\.in$//;s/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$!s/$/ \\/' # # simd and tmpfile are Linux-only and not installed elsewhere # # C programs are specced in ../Makefile.am above as part of the main Makefile find_common := find functional/ ! -type d ! -name .gitignore ! -name .dirstamp ! -name '*.Po' regen: @$(MAKE) -C $(top_builddir) clean @$(MAKE) clean $(SED) $(ac_inplace) '/^# -- >8 --/q' Makefile.am echo >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_DATA = \' >> Makefile.am $(find_common) ! -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \' >> Makefile.am $(find_common) -executable -name '*.in' | sort | sed 's/\.in$$//;s/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'SUBSTFILES += $$(nobase_nodist_datadir_zfs_tests_tests_DATA) $$(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS)' >> Makefile.am echo >> Makefile.am echo 'if BUILD_LINUX' >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) ! -name '*.in' ! -name '*.c' | grep -Fe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo 'endif' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_DATA += \' >> Makefile.am $(find_common) ! -executable ! -name '*.in' ! -name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am echo >> Makefile.am echo 'nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \' >> Makefile.am $(find_common) -executable ! -name '*.in' ! 
-name '*.c' | grep -vFe /simd -e /tmpfile | sort | sed 's/^/\t/;$$!s/$$/ \\/' >> Makefile.am # -- >8 -- nobase_nodist_datadir_zfs_tests_tests_DATA = \ functional/pam/utilities.kshlib nobase_nodist_datadir_zfs_tests_tests_SCRIPTS = \ functional/pyzfs/pyzfs_unittest.ksh SUBSTFILES += $(nobase_nodist_datadir_zfs_tests_tests_DATA) $(nobase_nodist_datadir_zfs_tests_tests_SCRIPTS) if BUILD_LINUX nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/simd/simd_supported.ksh \ functional/tmpfile/cleanup.ksh \ functional/tmpfile/setup.ksh \ functional/luks/luks_sanity.ksh endif nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/acl/acl.cfg \ functional/acl/acl_common.kshlib \ functional/alloc_class/alloc_class.cfg \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ functional/bclone/bclone.cfg \ functional/bclone/bclone_common.kshlib \ functional/bclone/bclone_corner_cases.kshlib \ functional/block_cloning/block_cloning.kshlib \ functional/cache/cache.cfg \ functional/cache/cache.kshlib \ functional/cachefile/cachefile.cfg \ functional/cachefile/cachefile.kshlib \ functional/casenorm/casenorm.cfg \ functional/casenorm/casenorm.kshlib \ functional/channel_program/channel_common.kshlib \ functional/channel_program/lua_core/tst.args_to_lua.out \ functional/channel_program/lua_core/tst.args_to_lua.zcp \ functional/channel_program/lua_core/tst.divide_by_zero.err \ functional/channel_program/lua_core/tst.divide_by_zero.zcp \ functional/channel_program/lua_core/tst.encryption.zcp \ functional/channel_program/lua_core/tst.exists.zcp \ functional/channel_program/lua_core/tst.large_prog.out \ functional/channel_program/lua_core/tst.large_prog.zcp \ functional/channel_program/lua_core/tst.lib_base.lua \ functional/channel_program/lua_core/tst.lib_coroutine.lua \ functional/channel_program/lua_core/tst.lib_strings.lua \ functional/channel_program/lua_core/tst.lib_table.lua \ functional/channel_program/lua_core/tst.nested_neg.zcp \ functional/channel_program/lua_core/tst.nested_pos.zcp \ functional/channel_program/lua_core/tst.recursive.zcp \ functional/channel_program/lua_core/tst.return_large.zcp \ functional/channel_program/lua_core/tst.return_recursive_table.zcp \ functional/channel_program/lua_core/tst.stack_gsub.err \ functional/channel_program/lua_core/tst.stack_gsub.zcp \ functional/channel_program/lua_core/tst.timeout.zcp \ functional/channel_program/synctask_core/tst.bookmark.copy.zcp \ functional/channel_program/synctask_core/tst.bookmark.create.zcp \ functional/channel_program/synctask_core/tst.clone.zcp \ functional/channel_program/synctask_core/tst.get_index_props.out \ functional/channel_program/synctask_core/tst.get_index_props.zcp \ functional/channel_program/synctask_core/tst.get_number_props.out \ functional/channel_program/synctask_core/tst.get_number_props.zcp \ functional/channel_program/synctask_core/tst.get_string_props.out \ functional/channel_program/synctask_core/tst.get_string_props.zcp \ functional/channel_program/synctask_core/tst.promote_conflict.zcp \ functional/channel_program/synctask_core/tst.set_props.zcp \ functional/channel_program/synctask_core/tst.snapshot_destroy.zcp \ functional/channel_program/synctask_core/tst.snapshot_neg.zcp \ functional/channel_program/synctask_core/tst.snapshot_recursive.zcp \ functional/channel_program/synctask_core/tst.snapshot_rename.zcp \ functional/channel_program/synctask_core/tst.snapshot_simple.zcp \ functional/checksum/default.cfg \ 
functional/clean_mirror/clean_mirror_common.kshlib \ functional/clean_mirror/default.cfg \ functional/crypto/aes_ccm_test.json \ functional/crypto/aes_ccm_test.txt \ functional/crypto/aes_gcm_test.json \ functional/crypto/aes_gcm_test.txt \ functional/cli_root/cli_common.kshlib \ functional/cli_root/zfs_copies/zfs_copies.cfg \ functional/cli_root/zfs_copies/zfs_copies.kshlib \ functional/cli_root/zfs_create/properties.kshlib \ functional/cli_root/zfs_create/zfs_create.cfg \ functional/cli_root/zfs_create/zfs_create_common.kshlib \ functional/cli_root/zfs_destroy/zfs_destroy.cfg \ functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib \ functional/cli_root/zfs_get/zfs_get_common.kshlib \ functional/cli_root/zfs_get/zfs_get_list_d.kshlib \ functional/cli_root/zfs_jail/jail.conf \ functional/cli_root/zfs_load-key/HEXKEY \ functional/cli_root/zfs_load-key/PASSPHRASE \ functional/cli_root/zfs_load-key/RAWKEY \ functional/cli_root/zfs_load-key/zfs_load-key.cfg \ functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib \ functional/cli_root/zfs_mount/zfs_mount.cfg \ functional/cli_root/zfs_mount/zfs_mount.kshlib \ functional/cli_root/zfs_promote/zfs_promote.cfg \ functional/cli_root/zfs_receive/zstd_test_data.txt \ functional/cli_root/zfs_rename/zfs_rename.cfg \ functional/cli_root/zfs_rename/zfs_rename.kshlib \ functional/cli_root/zfs_rollback/zfs_rollback.cfg \ functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib \ functional/cli_root/zfs_send/zfs_send.cfg \ functional/cli_root/zfs_set/zfs_set_common.kshlib \ functional/cli_root/zfs_share/zfs_share.cfg \ functional/cli_root/zfs_snapshot/zfs_snapshot.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.cfg \ functional/cli_root/zfs_unmount/zfs_unmount.kshlib \ functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib \ functional/cli_root/zfs_wait/zfs_wait.kshlib \ functional/cli_root/zpool_add/zpool_add.cfg \ functional/cli_root/zpool_add/zpool_add.kshlib \ functional/cli_root/zpool_clear/zpool_clear.cfg \ functional/cli_root/zpool_create/draidcfg.gz \ functional/cli_root/zpool_create/zpool_create.cfg \ functional/cli_root/zpool_create/zpool_create.shlib \ functional/cli_root/zpool_destroy/zpool_destroy.cfg \ functional/cli_root/zpool_events/zpool_events.cfg \ functional/cli_root/zpool_events/zpool_events.kshlib \ functional/cli_root/zpool_expand/zpool_expand.cfg \ functional/cli_root/zpool_export/zpool_export.cfg \ functional/cli_root/zpool_export/zpool_export.kshlib \ functional/cli_root/zpool_get/vdev_get.cfg \ functional/cli_root/zpool_get/zpool_get.cfg \ functional/cli_root/zpool_get/zpool_get_parsable.cfg \ functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 \ functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 \ functional/cli_root/zpool_import/zpool_import.cfg \ functional/cli_root/zpool_import/zpool_import.kshlib \ functional/cli_root/zpool_initialize/zpool_initialize.kshlib \ functional/cli_root/zpool_labelclear/labelclear.cfg \ functional/cli_root/zpool_remove/zpool_remove.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.cfg \ functional/cli_root/zpool_reopen/zpool_reopen.shlib \ functional/cli_root/zpool_resilver/zpool_resilver.cfg \ functional/cli_root/zpool_scrub/zpool_scrub.cfg \ functional/cli_root/zpool_split/zpool_split.cfg \ functional/cli_root/zpool_trim/zpool_trim.kshlib \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 \ 
functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 \ functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 \ 
functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 \ functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \ functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \ functional/cli_root/zpool_wait/zpool_wait.kshlib \ functional/cli_root/zhack/library.kshlib \ functional/cli_user/misc/misc.cfg \ functional/cli_user/zfs_list/zfs_list.cfg \ functional/cli_user/zfs_list/zfs_list.kshlib \ functional/compression/compress.cfg \ functional/compression/testpool_zstd.tar.gz \ functional/deadman/deadman.cfg \ functional/delegate/delegate.cfg \ functional/delegate/delegate_common.kshlib \ functional/devices/devices.cfg \ functional/devices/devices_common.kshlib \ functional/direct/dio.cfg \ functional/direct/dio.kshlib \ functional/events/events.cfg \ functional/events/events_common.kshlib \ functional/failmode/failmode.kshlib \ functional/fault/fault.cfg \ functional/gang_blocks/gang_blocks.kshlib \ functional/grow/grow.cfg \ functional/history/history.cfg \ functional/history/history_common.kshlib \ functional/history/i386.migratedpool.DAT.Z \ functional/history/i386.orig_history.txt \ functional/history/sparc.migratedpool.DAT.Z \ functional/history/sparc.orig_history.txt \ functional/history/zfs-pool-v4.dat.Z \ functional/inheritance/config001.cfg \ functional/inheritance/config002.cfg \ functional/inheritance/config003.cfg \ functional/inheritance/config004.cfg \ functional/inheritance/config005.cfg \ functional/inheritance/config006.cfg \ functional/inheritance/config007.cfg \ functional/inheritance/config008.cfg \ functional/inheritance/config009.cfg \ functional/inheritance/config010.cfg \ functional/inheritance/config011.cfg \ functional/inheritance/config012.cfg \ functional/inheritance/config013.cfg \ functional/inheritance/config014.cfg \ functional/inheritance/config015.cfg \ functional/inheritance/config016.cfg \ functional/inheritance/config017.cfg \ functional/inheritance/config018.cfg \ functional/inheritance/config019.cfg \ functional/inheritance/config020.cfg \ functional/inheritance/config021.cfg \ functional/inheritance/config022.cfg \ functional/inheritance/config023.cfg \ functional/inheritance/config024.cfg \ functional/inheritance/inherit.kshlib \ functional/inheritance/README.config \ functional/inheritance/README.state \ functional/inheritance/state001.cfg \ functional/inheritance/state002.cfg \ functional/inheritance/state003.cfg \ functional/inheritance/state004.cfg \ functional/inheritance/state005.cfg \ functional/inheritance/state006.cfg \ functional/inheritance/state007.cfg \ functional/inheritance/state008.cfg \ functional/inheritance/state009.cfg \ functional/inheritance/state010.cfg \ functional/inheritance/state011.cfg \ functional/inheritance/state012.cfg \ functional/inheritance/state013.cfg \ functional/inheritance/state014.cfg \ functional/inheritance/state015.cfg \ functional/inheritance/state016.cfg \ functional/inheritance/state017.cfg \ functional/inheritance/state018.cfg \ functional/inheritance/state019.cfg \ functional/inheritance/state020.cfg \ functional/inheritance/state021.cfg \ functional/inheritance/state022.cfg \ functional/inheritance/state023.cfg \ functional/inheritance/state024.cfg \ functional/inuse/inuse.cfg \ functional/io/io.cfg \ functional/l2arc/l2arc.cfg \ functional/largest_pool/largest_pool.cfg \ functional/migration/migration.cfg \ functional/migration/migration.kshlib \ functional/mmap/mmap.cfg \ functional/mmp/mmp.cfg \ functional/mmp/mmp.kshlib \ functional/mv_files/mv_files.cfg \ 
functional/mv_files/mv_files_common.kshlib \ functional/nopwrite/nopwrite.shlib \ functional/no_space/enospc.cfg \ functional/online_offline/online_offline.cfg \ functional/pool_checkpoint/pool_checkpoint.kshlib \ functional/projectquota/projectquota.cfg \ functional/projectquota/projectquota_common.kshlib \ functional/quota/quota.cfg \ functional/quota/quota.kshlib \ functional/redacted_send/redacted.cfg \ functional/redacted_send/redacted.kshlib \ functional/redundancy/redundancy.cfg \ functional/redundancy/redundancy.kshlib \ functional/refreserv/refreserv.cfg \ functional/removal/removal.kshlib \ functional/replacement/replacement.cfg \ functional/reservation/reservation.cfg \ functional/reservation/reservation.shlib \ functional/rsend/dedup_encrypted_zvol.bz2 \ functional/rsend/dedup_encrypted_zvol.zsend.bz2 \ functional/rsend/dedup.zsend.bz2 \ functional/rsend/fs.tar.gz \ functional/rsend/rsend.cfg \ functional/rsend/rsend.kshlib \ functional/scrub_mirror/default.cfg \ functional/scrub_mirror/scrub_mirror_common.kshlib \ functional/slog/slog.cfg \ functional/slog/slog.kshlib \ functional/snapshot/snapshot.cfg \ functional/snapused/snapused.kshlib \ functional/sparse/sparse.cfg \ functional/trim/trim.cfg \ functional/trim/trim.kshlib \ functional/truncate/truncate.cfg \ functional/upgrade/upgrade_common.kshlib \ functional/user_namespace/user_namespace.cfg \ functional/user_namespace/user_namespace_common.kshlib \ functional/userquota/13709_reproducer.bz2 \ functional/userquota/userquota.cfg \ functional/userquota/userquota_common.kshlib \ functional/vdev_zaps/vdev_zaps.kshlib \ functional/xattr/xattr.cfg \ functional/xattr/xattr_common.kshlib \ functional/zvol/zvol.cfg \ functional/zvol/zvol_cli/zvol_cli.cfg \ functional/zvol/zvol_common.shlib \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg \ functional/zvol/zvol_misc/zvol_misc_common.kshlib \ functional/zvol/zvol_swap/zvol_swap.cfg \ functional/idmap_mount/idmap_mount.cfg \ functional/idmap_mount/idmap_mount_common.kshlib nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/acl/off/cleanup.ksh \ functional/acl/off/dosmode.ksh \ functional/acl/off/posixmode.ksh \ functional/acl/off/setup.ksh \ functional/acl/posix/cleanup.ksh \ functional/acl/posix/posix_001_pos.ksh \ functional/acl/posix/posix_002_pos.ksh \ functional/acl/posix/posix_003_pos.ksh \ functional/acl/posix/posix_004_pos.ksh \ functional/acl/posix-sa/cleanup.ksh \ functional/acl/posix-sa/posix_001_pos.ksh \ functional/acl/posix-sa/posix_002_pos.ksh \ functional/acl/posix-sa/posix_003_pos.ksh \ functional/acl/posix-sa/posix_004_pos.ksh \ functional/acl/posix-sa/setup.ksh \ functional/acl/posix/setup.ksh \ functional/alloc_class/alloc_class_001_pos.ksh \ functional/alloc_class/alloc_class_002_neg.ksh \ functional/alloc_class/alloc_class_003_pos.ksh \ functional/alloc_class/alloc_class_004_pos.ksh \ functional/alloc_class/alloc_class_005_pos.ksh \ functional/alloc_class/alloc_class_006_pos.ksh \ functional/alloc_class/alloc_class_007_pos.ksh \ functional/alloc_class/alloc_class_008_pos.ksh \ functional/alloc_class/alloc_class_009_pos.ksh \ functional/alloc_class/alloc_class_010_pos.ksh \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ functional/alloc_class/alloc_class_016_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ functional/append/file_append.ksh \ functional/append/threadsappend_001_pos.ksh \ functional/append/cleanup.ksh \ 
functional/append/setup.ksh \ functional/arc/arcstats_runtime_tuning.ksh \ functional/arc/cleanup.ksh \ functional/arc/dbufstats_001_pos.ksh \ functional/arc/dbufstats_002_pos.ksh \ functional/arc/dbufstats_003_pos.ksh \ functional/arc/setup.ksh \ functional/atime/atime_001_pos.ksh \ functional/atime/atime_002_neg.ksh \ functional/atime/atime_003_pos.ksh \ functional/atime/cleanup.ksh \ functional/atime/root_atime_off.ksh \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ functional/bclone/bclone_crossfs_corner_cases.ksh \ functional/bclone/bclone_crossfs_corner_cases_limited.ksh \ functional/bclone/bclone_crossfs_data.ksh \ functional/bclone/bclone_crossfs_embedded.ksh \ functional/bclone/bclone_crossfs_hole.ksh \ functional/bclone/bclone_diffprops_all.ksh \ functional/bclone/bclone_diffprops_checksum.ksh \ functional/bclone/bclone_diffprops_compress.ksh \ functional/bclone/bclone_diffprops_copies.ksh \ functional/bclone/bclone_diffprops_recordsize.ksh \ functional/bclone/bclone_prop_sync.ksh \ functional/bclone/bclone_samefs_corner_cases.ksh \ functional/bclone/bclone_samefs_corner_cases_limited.ksh \ functional/bclone/bclone_samefs_data.ksh \ functional/bclone/bclone_samefs_embedded.ksh \ functional/bclone/bclone_samefs_hole.ksh \ functional/bclone/cleanup.ksh \ functional/bclone/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_clone_mmap_cached.ksh \ functional/block_cloning/block_cloning_clone_mmap_write.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ functional/block_cloning/block_cloning_copyfilerange.ksh \ functional/block_cloning/block_cloning_copyfilerange_partial.ksh \ functional/block_cloning/block_cloning_disabled_copyfilerange.ksh \ functional/block_cloning/block_cloning_disabled_ficlone.ksh \ functional/block_cloning/block_cloning_disabled_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlone.ksh \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ functional/block_cloning/block_cloning_cross_enc_dataset.ksh \ functional/block_cloning/block_cloning_replay.ksh \ functional/block_cloning/block_cloning_replay_encrypted.ksh \ functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \ functional/block_cloning/block_cloning_rlimit_fsize.ksh \ functional/block_cloning/block_cloning_large_offset.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ functional/bootfs/bootfs_004_neg.ksh \ functional/bootfs/bootfs_005_neg.ksh \ functional/bootfs/bootfs_006_pos.ksh \ functional/bootfs/bootfs_007_pos.ksh \ functional/bootfs/bootfs_008_pos.ksh \ functional/bootfs/cleanup.ksh \ functional/bootfs/setup.ksh \ functional/btree/btree_negative.ksh \ functional/btree/btree_positive.ksh \ functional/cache/cache_001_pos.ksh \ functional/cache/cache_002_pos.ksh \ functional/cache/cache_003_pos.ksh \ functional/cache/cache_004_neg.ksh \ functional/cache/cache_005_neg.ksh \ functional/cache/cache_006_pos.ksh \ functional/cache/cache_007_neg.ksh \ functional/cache/cache_008_neg.ksh \ functional/cache/cache_009_pos.ksh \ functional/cache/cache_010_pos.ksh \ functional/cache/cache_011_pos.ksh \ functional/cache/cache_012_pos.ksh \ 
functional/cache/cleanup.ksh \ functional/cachefile/cachefile_001_pos.ksh \ functional/cachefile/cachefile_002_pos.ksh \ functional/cachefile/cachefile_003_pos.ksh \ functional/cachefile/cachefile_004_pos.ksh \ functional/cachefile/cleanup.ksh \ functional/cachefile/setup.ksh \ functional/cache/setup.ksh \ functional/casenorm/case_all_values.ksh \ functional/casenorm/cleanup.ksh \ functional/casenorm/insensitive_formd_delete.ksh \ functional/casenorm/insensitive_formd_lookup.ksh \ functional/casenorm/insensitive_none_delete.ksh \ functional/casenorm/insensitive_none_lookup.ksh \ functional/casenorm/mixed_create_failure.ksh \ functional/casenorm/mixed_formd_delete.ksh \ functional/casenorm/mixed_formd_lookup_ci.ksh \ functional/casenorm/mixed_formd_lookup.ksh \ functional/casenorm/mixed_none_delete.ksh \ functional/casenorm/mixed_none_lookup_ci.ksh \ functional/casenorm/mixed_none_lookup.ksh \ functional/casenorm/norm_all_values.ksh \ functional/casenorm/sensitive_formd_delete.ksh \ functional/casenorm/sensitive_formd_lookup.ksh \ functional/casenorm/sensitive_none_delete.ksh \ functional/casenorm/sensitive_none_lookup.ksh \ functional/casenorm/setup.ksh \ functional/channel_program/lua_core/cleanup.ksh \ functional/channel_program/lua_core/setup.ksh \ functional/channel_program/lua_core/tst.args_to_lua.ksh \ functional/channel_program/lua_core/tst.divide_by_zero.ksh \ functional/channel_program/lua_core/tst.encryption.ksh \ functional/channel_program/lua_core/tst.exists.ksh \ functional/channel_program/lua_core/tst.integer_illegal.ksh \ functional/channel_program/lua_core/tst.integer_overflow.ksh \ functional/channel_program/lua_core/tst.language_functions_neg.ksh \ functional/channel_program/lua_core/tst.language_functions_pos.ksh \ functional/channel_program/lua_core/tst.large_prog.ksh \ functional/channel_program/lua_core/tst.libraries.ksh \ functional/channel_program/lua_core/tst.memory_limit.ksh \ functional/channel_program/lua_core/tst.nested_neg.ksh \ functional/channel_program/lua_core/tst.nested_pos.ksh \ functional/channel_program/lua_core/tst.nvlist_to_lua.ksh \ functional/channel_program/lua_core/tst.recursive_neg.ksh \ functional/channel_program/lua_core/tst.recursive_pos.ksh \ functional/channel_program/lua_core/tst.return_large.ksh \ functional/channel_program/lua_core/tst.return_nvlist_neg.ksh \ functional/channel_program/lua_core/tst.return_nvlist_pos.ksh \ functional/channel_program/lua_core/tst.return_recursive_table.ksh \ functional/channel_program/lua_core/tst.stack_gsub.ksh \ functional/channel_program/lua_core/tst.timeout.ksh \ functional/channel_program/synctask_core/cleanup.ksh \ functional/channel_program/synctask_core/setup.ksh \ functional/channel_program/synctask_core/tst.bookmark.copy.ksh \ functional/channel_program/synctask_core/tst.bookmark.create.ksh \ functional/channel_program/synctask_core/tst.clone.ksh \ functional/channel_program/synctask_core/tst.destroy_fs.ksh \ functional/channel_program/synctask_core/tst.destroy_snap.ksh \ functional/channel_program/synctask_core/tst.get_count_and_limit.ksh \ functional/channel_program/synctask_core/tst.get_index_props.ksh \ functional/channel_program/synctask_core/tst.get_mountpoint.ksh \ functional/channel_program/synctask_core/tst.get_neg.ksh \ functional/channel_program/synctask_core/tst.get_number_props.ksh \ functional/channel_program/synctask_core/tst.get_string_props.ksh \ functional/channel_program/synctask_core/tst.get_type.ksh \ functional/channel_program/synctask_core/tst.get_userquota.ksh \ 
functional/channel_program/synctask_core/tst.get_written.ksh \ functional/channel_program/synctask_core/tst.inherit.ksh \ functional/channel_program/synctask_core/tst.list_bookmarks.ksh \ functional/channel_program/synctask_core/tst.list_children.ksh \ functional/channel_program/synctask_core/tst.list_clones.ksh \ functional/channel_program/synctask_core/tst.list_holds.ksh \ functional/channel_program/synctask_core/tst.list_snapshots.ksh \ functional/channel_program/synctask_core/tst.list_system_props.ksh \ functional/channel_program/synctask_core/tst.list_user_props.ksh \ functional/channel_program/synctask_core/tst.parse_args_neg.ksh \ functional/channel_program/synctask_core/tst.promote_conflict.ksh \ functional/channel_program/synctask_core/tst.promote_multiple.ksh \ functional/channel_program/synctask_core/tst.promote_simple.ksh \ functional/channel_program/synctask_core/tst.rollback_mult.ksh \ functional/channel_program/synctask_core/tst.rollback_one.ksh \ functional/channel_program/synctask_core/tst.set_props.ksh \ functional/channel_program/synctask_core/tst.snapshot_destroy.ksh \ functional/channel_program/synctask_core/tst.snapshot_neg.ksh \ functional/channel_program/synctask_core/tst.snapshot_recursive.ksh \ functional/channel_program/synctask_core/tst.snapshot_rename.ksh \ functional/channel_program/synctask_core/tst.snapshot_simple.ksh \ functional/channel_program/synctask_core/tst.terminate_by_signal.ksh \ functional/chattr/chattr_001_pos.ksh \ functional/chattr/chattr_002_neg.ksh \ functional/chattr/cleanup.ksh \ functional/chattr/setup.ksh \ functional/checksum/cleanup.ksh \ functional/checksum/filetest_001_pos.ksh \ functional/checksum/filetest_002_pos.ksh \ functional/checksum/run_blake3_test.ksh \ functional/checksum/run_edonr_test.ksh \ functional/checksum/run_sha2_test.ksh \ functional/checksum/run_skein_test.ksh \ functional/checksum/setup.ksh \ functional/clean_mirror/clean_mirror_001_pos.ksh \ functional/clean_mirror/clean_mirror_002_pos.ksh \ functional/clean_mirror/clean_mirror_003_pos.ksh \ functional/clean_mirror/clean_mirror_004_pos.ksh \ functional/clean_mirror/cleanup.ksh \ functional/clean_mirror/setup.ksh \ functional/cli_root/json/cleanup.ksh \ functional/cli_root/json/setup.ksh \ functional/cli_root/json/json_sanity.ksh \ functional/cli_root/zinject/zinject_args.ksh \ functional/cli_root/zinject/zinject_counts.ksh \ functional/cli_root/zinject/zinject_probe.ksh \ functional/cli_root/zdb/zdb_002_pos.ksh \ functional/cli_root/zdb/zdb_003_pos.ksh \ functional/cli_root/zdb/zdb_004_pos.ksh \ functional/cli_root/zdb/zdb_005_pos.ksh \ functional/cli_root/zdb/zdb_006_pos.ksh \ functional/cli_root/zdb/zdb_args_neg.ksh \ functional/cli_root/zdb/zdb_args_pos.ksh \ functional/cli_root/zdb/zdb_backup.ksh \ functional/cli_root/zdb/zdb_block_size_histogram.ksh \ functional/cli_root/zdb/zdb_checksum.ksh \ functional/cli_root/zdb/zdb_decompress.ksh \ functional/cli_root/zdb/zdb_decompress_zstd.ksh \ functional/cli_root/zdb/zdb_display_block.ksh \ functional/cli_root/zdb/zdb_encrypted.ksh \ functional/cli_root/zdb/zdb_label_checksum.ksh \ functional/cli_root/zdb/zdb_object_range_neg.ksh \ functional/cli_root/zdb/zdb_object_range_pos.ksh \ functional/cli_root/zdb/zdb_objset_id.ksh \ functional/cli_root/zdb/zdb_recover_2.ksh \ functional/cli_root/zdb/zdb_recover.ksh \ functional/cli_root/zdb/zdb_tunables.ksh \ functional/cli_root/zfs_bookmark/cleanup.ksh \ functional/cli_root/zfs_bookmark/setup.ksh \ functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh \ 
functional/cli_root/zfs_change-key/cleanup.ksh \ functional/cli_root/zfs_change-key/setup.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_child.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_format.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh \ functional/cli_root/zfs_change-key/zfs_change-key.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_load.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_location.ksh \ functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh \ functional/cli_root/zfs/cleanup.ksh \ functional/cli_root/zfs_clone/cleanup.ksh \ functional/cli_root/zfs_clone/setup.ksh \ functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh \ functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh \ functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh \ functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh \ functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh \ functional/cli_root/zfs_copies/cleanup.ksh \ functional/cli_root/zfs_copies/setup.ksh \ functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh \ functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh \ functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh \ functional/cli_root/zfs_create/cleanup.ksh \ functional/cli_root/zfs_create/setup.ksh \ functional/cli_root/zfs_create/zfs_create_001_pos.ksh \ functional/cli_root/zfs_create/zfs_create_002_pos.ksh \ functional/cli_root/zfs_create/zfs_create_003_pos.ksh \ functional/cli_root/zfs_create/zfs_create_004_pos.ksh \ functional/cli_root/zfs_create/zfs_create_005_pos.ksh \ functional/cli_root/zfs_create/zfs_create_006_pos.ksh \ functional/cli_root/zfs_create/zfs_create_007_pos.ksh \ functional/cli_root/zfs_create/zfs_create_008_neg.ksh \ functional/cli_root/zfs_create/zfs_create_009_neg.ksh \ functional/cli_root/zfs_create/zfs_create_010_neg.ksh \ functional/cli_root/zfs_create/zfs_create_011_pos.ksh \ functional/cli_root/zfs_create/zfs_create_012_pos.ksh \ functional/cli_root/zfs_create/zfs_create_013_pos.ksh \ functional/cli_root/zfs_create/zfs_create_014_pos.ksh \ functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh \ functional/cli_root/zfs_create/zfs_create_dryrun.ksh \ functional/cli_root/zfs_create/zfs_create_encrypted.ksh \ functional/cli_root/zfs_create/zfs_create_nomount.ksh \ functional/cli_root/zfs_create/zfs_create_verbose.ksh \ functional/cli_root/zfs_destroy/cleanup.ksh \ functional/cli_root/zfs_destroy/setup.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh \ functional/cli_root/zfs_destroy/zfs_clone_livelist_dedup.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh \ 
functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh \ functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh \ functional/cli_root/zfs_diff/cleanup.ksh \ functional/cli_root/zfs_diff/setup.ksh \ functional/cli_root/zfs_diff/zfs_diff_changes.ksh \ functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh \ functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh \ functional/cli_root/zfs_diff/zfs_diff_mangle.ksh \ functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh \ functional/cli_root/zfs_diff/zfs_diff_types.ksh \ functional/cli_root/zfs_get/cleanup.ksh \ functional/cli_root/zfs_get/setup.ksh \ functional/cli_root/zfs_get/zfs_get_001_pos.ksh \ functional/cli_root/zfs_get/zfs_get_002_pos.ksh \ functional/cli_root/zfs_get/zfs_get_003_pos.ksh \ functional/cli_root/zfs_get/zfs_get_004_pos.ksh \ functional/cli_root/zfs_get/zfs_get_005_neg.ksh \ functional/cli_root/zfs_get/zfs_get_006_neg.ksh \ functional/cli_root/zfs_get/zfs_get_007_neg.ksh \ functional/cli_root/zfs_get/zfs_get_008_pos.ksh \ functional/cli_root/zfs_get/zfs_get_009_pos.ksh \ functional/cli_root/zfs_get/zfs_get_010_neg.ksh \ functional/cli_root/zfs_ids_to_path/cleanup.ksh \ functional/cli_root/zfs_ids_to_path/setup.ksh \ functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh \ functional/cli_root/zfs_inherit/cleanup.ksh \ functional/cli_root/zfs_inherit/setup.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh \ functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh \ functional/cli_root/zfs_jail/cleanup.ksh \ functional/cli_root/zfs_jail/setup.ksh \ functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh \ functional/cli_root/zfs_load-key/cleanup.ksh \ functional/cli_root/zfs_load-key/setup.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_all.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_file.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_https.ksh \ functional/cli_root/zfs_load-key/zfs_load-key.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_location.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh \ functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh \ functional/cli_root/zfs_mount/cleanup.ksh \ functional/cli_root/zfs_mount/setup.ksh \ functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh \ 
functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_013_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_014_neg.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh \ functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh \ functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh \ functional/cli_root/zfs_mount/zfs_mount_recursive.ksh \ functional/cli_root/zfs_mount/zfs_mount_remount.ksh \ functional/cli_root/zfs_mount/zfs_mount_test_race.ksh \ functional/cli_root/zfs_mount/zfs_multi_mount.ksh \ functional/cli_root/zfs_program/cleanup.ksh \ functional/cli_root/zfs_program/setup.ksh \ functional/cli_root/zfs_program/zfs_program_json.ksh \ functional/cli_root/zfs_promote/cleanup.ksh \ functional/cli_root/zfs_promote/setup.ksh \ functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh \ functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh \ functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh \ functional/cli_root/zfs_property/cleanup.ksh \ functional/cli_root/zfs_property/setup.ksh \ functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh \ functional/cli_root/zfs_receive/cleanup.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_aliases.ksh \ functional/cli_root/zfs_receive/receive-o-x_props_override.ksh \ functional/cli_root/zfs_receive/setup.ksh \ functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh \ functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh \ functional/cli_root/zfs_receive/zfs_receive_-e.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh \ functional/cli_root/zfs_receive/zfs_receive_new_props.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh \ functional/cli_root/zfs_receive/zfs_receive_raw.ksh \ functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh \ functional/cli_root/zfs_receive/zfs_receive_-wR-encrypted-mix.ksh \ functional/cli_root/zfs_receive/zfs_receive_corrective.ksh \ functional/cli_root/zfs_receive/zfs_receive_compressed_corrective.ksh \ 
functional/cli_root/zfs_receive/zfs_receive_large_block_corrective.ksh \ functional/cli_root/zfs_rename/cleanup.ksh \ functional/cli_root/zfs_rename/setup.ksh \ functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh \ functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh \ functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh \ functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh \ functional/cli_root/zfs_rename/zfs_rename_nounmount.ksh \ functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh \ functional/cli_root/zfs_reservation/cleanup.ksh \ functional/cli_root/zfs_reservation/setup.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh \ functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh \ functional/cli_root/zfs_rewrite/cleanup.ksh \ functional/cli_root/zfs_rewrite/setup.ksh \ functional/cli_root/zfs_rewrite/zfs_rewrite.ksh \ functional/cli_root/zfs_rewrite/zfs_rewrite_physical.ksh \ functional/cli_root/zfs_rollback/cleanup.ksh \ functional/cli_root/zfs_rollback/setup.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh \ functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh \ functional/cli_root/zfs_send/cleanup.ksh \ functional/cli_root/zfs_send/setup.ksh \ functional/cli_root/zfs_send/zfs_send_001_pos.ksh \ functional/cli_root/zfs_send/zfs_send_002_pos.ksh \ functional/cli_root/zfs_send/zfs_send_003_pos.ksh \ functional/cli_root/zfs_send/zfs_send_004_neg.ksh \ functional/cli_root/zfs_send/zfs_send_005_pos.ksh \ functional/cli_root/zfs_send/zfs_send_006_pos.ksh \ functional/cli_root/zfs_send/zfs_send_007_pos.ksh \ functional/cli_root/zfs_send/zfs_send-b.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted.ksh \ functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh \ functional/cli_root/zfs_send/zfs_send_raw.ksh \ functional/cli_root/zfs_send/zfs_send_skip_missing.ksh \ functional/cli_root/zfs_send/zfs_send_sparse.ksh \ functional/cli_root/zfs_set/cache_001_pos.ksh \ functional/cli_root/zfs_set/cache_002_neg.ksh \ functional/cli_root/zfs_set/canmount_001_pos.ksh \ functional/cli_root/zfs_set/canmount_002_pos.ksh \ functional/cli_root/zfs_set/canmount_003_pos.ksh \ functional/cli_root/zfs_set/canmount_004_pos.ksh \ functional/cli_root/zfs_set/checksum_001_pos.ksh \ functional/cli_root/zfs_set/cleanup.ksh \ functional/cli_root/zfs_set/compression_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_001_pos.ksh \ functional/cli_root/zfs_set/mountpoint_002_pos.ksh \ functional/cli_root/zfs_set/mountpoint_003_pos.ksh \ functional/cli_root/zfs_set/onoffs_001_pos.ksh \ functional/cli_root/zfs_set/property_alias_001_pos.ksh \ functional/cli_root/zfs_set/readonly_001_pos.ksh \ functional/cli_root/zfs_set/reservation_001_neg.ksh \ 
functional/cli_root/zfs_set/ro_props_001_pos.ksh \ functional/cli_root/zfs_set/setup.ksh \ functional/cli_root/zfs_set/share_mount_001_neg.ksh \ functional/cli_root/zfs_set/snapdir_001_pos.ksh \ functional/cli_root/zfs/setup.ksh \ functional/cli_root/zfs_set/user_property_001_pos.ksh \ functional/cli_root/zfs_set/user_property_002_pos.ksh \ functional/cli_root/zfs_set/user_property_003_neg.ksh \ functional/cli_root/zfs_set/user_property_004_pos.ksh \ functional/cli_root/zfs_set/version_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_001_neg.ksh \ functional/cli_root/zfs_set/zfs_set_002_neg.ksh \ functional/cli_root/zfs_set/zfs_set_003_neg.ksh \ functional/cli_root/zfs_set/zfs_set_feature_activation.ksh \ functional/cli_root/zfs_set/zfs_set_keylocation.ksh \ functional/cli_root/zfs_set/zfs_set_nomount.ksh \ functional/cli_root/zfs_share/cleanup.ksh \ functional/cli_root/zfs_share/setup.ksh \ functional/cli_root/zfs_share/zfs_share_001_pos.ksh \ functional/cli_root/zfs_share/zfs_share_002_pos.ksh \ functional/cli_root/zfs_share/zfs_share_003_pos.ksh \ functional/cli_root/zfs_share/zfs_share_004_pos.ksh \ functional/cli_root/zfs_share/zfs_share_005_pos.ksh \ functional/cli_root/zfs_share/zfs_share_006_pos.ksh \ functional/cli_root/zfs_share/zfs_share_007_neg.ksh \ functional/cli_root/zfs_share/zfs_share_008_neg.ksh \ functional/cli_root/zfs_share/zfs_share_009_neg.ksh \ functional/cli_root/zfs_share/zfs_share_010_neg.ksh \ functional/cli_root/zfs_share/zfs_share_011_pos.ksh \ functional/cli_root/zfs_share/zfs_share_012_pos.ksh \ functional/cli_root/zfs_share/zfs_share_013_pos.ksh \ functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \ functional/cli_root/zfs_share/zfs_share_after_mount.ksh \ functional/cli_root/zfs_snapshot/cleanup.ksh \ functional/cli_root/zfs_snapshot/setup.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh \ functional/cli_root/zfs_sysfs/cleanup.ksh \ functional/cli_root/zfs_sysfs/setup.ksh \ functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh \ functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh \ functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh \ functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh \ functional/cli_root/zfs_unload-key/cleanup.ksh \ functional/cli_root/zfs_unload-key/setup.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key.ksh \ functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh \ functional/cli_root/zfs_unmount/cleanup.ksh \ functional/cli_root/zfs_unmount/setup.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh \ 
functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh \ functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh \ functional/cli_root/zfs_unshare/cleanup.ksh \ functional/cli_root/zfs_unshare/setup.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh \ functional/cli_root/zfs_unshare/zfs_unshare_008_pos.ksh \ functional/cli_root/zfs_upgrade/cleanup.ksh \ functional/cli_root/zfs_upgrade/setup.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh \ functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh \ functional/cli_root/zfs_wait/cleanup.ksh \ functional/cli_root/zfs_wait/setup.ksh \ functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh \ functional/cli_root/zfs_wait/zfs_wait_getsubopt.ksh \ functional/cli_root/zfs/zfs_001_neg.ksh \ functional/cli_root/zfs/zfs_002_pos.ksh \ functional/cli_root/zfs/zfs_003_neg.ksh \ functional/cli_root/zhack/zhack_label_repair_001.ksh \ functional/cli_root/zhack/zhack_label_repair_002.ksh \ functional/cli_root/zhack/zhack_label_repair_003.ksh \ functional/cli_root/zhack/zhack_label_repair_004.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ functional/cli_root/zpool_add/cleanup.ksh \ functional/cli_root/zpool_add/setup.ksh \ functional/cli_root/zpool_add/zpool_add_001_pos.ksh \ functional/cli_root/zpool_add/zpool_add_002_pos.ksh \ functional/cli_root/zpool_add/zpool_add_003_pos.ksh \ functional/cli_root/zpool_add/zpool_add_004_pos.ksh \ functional/cli_root/zpool_add/zpool_add_005_pos.ksh \ functional/cli_root/zpool_add/zpool_add_006_pos.ksh \ functional/cli_root/zpool_add/zpool_add_007_neg.ksh \ functional/cli_root/zpool_add/zpool_add_008_neg.ksh \ functional/cli_root/zpool_add/zpool_add_009_neg.ksh \ functional/cli_root/zpool_add/zpool_add_010_pos.ksh \ functional/cli_root/zpool_add/zpool_add_dryrun_output.ksh \ functional/cli_root/zpool_attach/attach-o_ashift.ksh \ functional/cli_root/zpool_attach/cleanup.ksh \ functional/cli_root/zpool_attach/setup.ksh \ functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh \ functional/cli_root/zpool/cleanup.ksh \ functional/cli_root/zpool_clear/cleanup.ksh \ functional/cli_root/zpool_clear/setup.ksh \ functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh \ functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh \ functional/cli_root/zpool_clear/zpool_clear_readonly.ksh \ functional/cli_root/zpool_create/cleanup.ksh \ functional/cli_root/zpool_create/create-o_ashift.ksh \ functional/cli_root/zpool_create/setup.ksh \ 
functional/cli_root/zpool_create/zpool_create_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_007_neg.ksh \ functional/cli_root/zpool_create/zpool_create_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_009_neg.ksh \ functional/cli_root/zpool_create/zpool_create_010_neg.ksh \ functional/cli_root/zpool_create/zpool_create_011_neg.ksh \ functional/cli_root/zpool_create/zpool_create_012_neg.ksh \ functional/cli_root/zpool_create/zpool_create_014_neg.ksh \ functional/cli_root/zpool_create/zpool_create_015_neg.ksh \ functional/cli_root/zpool_create/zpool_create_016_pos.ksh \ functional/cli_root/zpool_create/zpool_create_017_neg.ksh \ functional/cli_root/zpool_create/zpool_create_018_pos.ksh \ functional/cli_root/zpool_create/zpool_create_019_pos.ksh \ functional/cli_root/zpool_create/zpool_create_020_pos.ksh \ functional/cli_root/zpool_create/zpool_create_021_pos.ksh \ functional/cli_root/zpool_create/zpool_create_022_pos.ksh \ functional/cli_root/zpool_create/zpool_create_023_neg.ksh \ functional/cli_root/zpool_create/zpool_create_024_pos.ksh \ functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_draid_004_pos.ksh \ functional/cli_root/zpool_create/zpool_create_dryrun_output.ksh \ functional/cli_root/zpool_create/zpool_create_encrypted.ksh \ functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh \ functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_006_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_007_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_008_pos.ksh \ functional/cli_root/zpool_create/zpool_create_features_009_pos.ksh \ functional/cli_root/zpool_create/zpool_create_tempname.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh \ functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh \ functional/cli_root/zpool_detach/cleanup.ksh \ functional/cli_root/zpool_detach/setup.ksh \ functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh \ functional/cli_root/zpool_events/cleanup.ksh \ functional/cli_root/zpool_events/setup.ksh \ functional/cli_root/zpool_events/zpool_events_clear.ksh \ functional/cli_root/zpool_events/zpool_events_clear_retained.ksh \ functional/cli_root/zpool_events/zpool_events_cliargs.ksh \ functional/cli_root/zpool_events/zpool_events_duplicates.ksh \ functional/cli_root/zpool_events/zpool_events_errors.ksh \ functional/cli_root/zpool_events/zpool_events_follow.ksh \ functional/cli_root/zpool_events/zpool_events_poolname.ksh \ functional/cli_root/zpool_events/zpool_events_scrub_txg_continue_from_last.ksh \ functional/cli_root/zpool_expand/cleanup.ksh \ functional/cli_root/zpool_expand/setup.ksh \ 
functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh \ functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh \ functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh \ functional/cli_root/zpool_export/cleanup.ksh \ functional/cli_root/zpool_export/setup.ksh \ functional/cli_root/zpool_export/zpool_export_001_pos.ksh \ functional/cli_root/zpool_export/zpool_export_002_pos.ksh \ functional/cli_root/zpool_export/zpool_export_003_neg.ksh \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ functional/cli_root/zpool_export/zpool_export_parallel_admin.ksh \ functional/cli_root/zpool_export/zpool_export_parallel_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ functional/cli_root/zpool_get/vdev_get_all.ksh \ functional/cli_root/zpool_get/zpool_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_002_pos.ksh \ functional/cli_root/zpool_get/zpool_get_003_pos.ksh \ functional/cli_root/zpool_get/zpool_get_004_neg.ksh \ functional/cli_root/zpool_get/zpool_get_005_pos.ksh \ functional/cli_root/zpool_history/cleanup.ksh \ functional/cli_root/zpool_history/setup.ksh \ functional/cli_root/zpool_history/zpool_history_001_neg.ksh \ functional/cli_root/zpool_history/zpool_history_002_pos.ksh \ functional/cli_root/zpool_import/cleanup.ksh \ functional/cli_root/zpool_import/import_cachefile_device_added.ksh \ functional/cli_root/zpool_import/import_cachefile_device_removed.ksh \ functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh \ functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh \ functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \ functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \ functional/cli_root/zpool_import/import_devices_missing.ksh \ functional/cli_root/zpool_import/import_log_missing.ksh \ functional/cli_root/zpool_import/import_paths_changed.ksh \ functional/cli_root/zpool_import/import_rewind_config_changed.ksh \ functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \ functional/cli_root/zpool_import/setup.ksh \ functional/cli_root/zpool_import/zpool_import_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_004_pos.ksh \ functional/cli_root/zpool_import/zpool_import_005_pos.ksh \ functional/cli_root/zpool_import/zpool_import_006_pos.ksh \ functional/cli_root/zpool_import/zpool_import_007_pos.ksh \ functional/cli_root/zpool_import/zpool_import_008_pos.ksh \ functional/cli_root/zpool_import/zpool_import_009_neg.ksh \ functional/cli_root/zpool_import/zpool_import_010_pos.ksh \ functional/cli_root/zpool_import/zpool_import_011_neg.ksh \ functional/cli_root/zpool_import/zpool_import_012_pos.ksh \ functional/cli_root/zpool_import/zpool_import_013_neg.ksh \ functional/cli_root/zpool_import/zpool_import_014_pos.ksh \ functional/cli_root/zpool_import/zpool_import_015_pos.ksh \ functional/cli_root/zpool_import/zpool_import_016_pos.ksh \ functional/cli_root/zpool_import/zpool_import_017_pos.ksh \ functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_encrypted.ksh \ 
functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh \ functional/cli_root/zpool_import/zpool_import_errata3.ksh \ functional/cli_root/zpool_import/zpool_import_errata4.ksh \ functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh \ functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile.ksh \ functional/cli_root/zpool_import/zpool_import_hostid_changed_cachefile_unclean_export.ksh \ functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh \ functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh \ functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh \ functional/cli_root/zpool_import/zpool_import_status.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_admin.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_neg.ksh \ functional/cli_root/zpool_import/zpool_import_parallel_pos.ksh \ functional/cli_root/zpool_initialize/cleanup.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_fault_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_multiple_pools.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_split.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh \ functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh \ functional/cli_root/zpool_offline/cleanup.ksh \ functional/cli_root/zpool_offline/setup.ksh \ functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh \ functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh \ functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh \ functional/cli_root/zpool_online/cleanup.ksh \ functional/cli_root/zpool_online/setup.ksh \ functional/cli_root/zpool_online/zpool_online_001_pos.ksh \ functional/cli_root/zpool_online/zpool_online_002_neg.ksh \ functional/cli_root/zpool_prefetch/cleanup.ksh \ functional/cli_root/zpool_prefetch/setup.ksh \ functional/cli_root/zpool_prefetch/zpool_prefetch_001_pos.ksh \ functional/cli_root/zpool_reguid/cleanup.ksh \ functional/cli_root/zpool_reguid/setup.ksh \ functional/cli_root/zpool_reguid/zpool_reguid_001_pos.ksh \ functional/cli_root/zpool_reguid/zpool_reguid_002_neg.ksh \ 
functional/cli_root/zpool_remove/cleanup.ksh \ functional/cli_root/zpool_remove/setup.ksh \ functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh \ functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh \ functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh \ functional/cli_root/zpool_reopen/cleanup.ksh \ functional/cli_root/zpool_reopen/setup.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh \ functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh \ functional/cli_root/zpool_replace/cleanup.ksh \ functional/cli_root/zpool_replace/replace-o_ashift.ksh \ functional/cli_root/zpool_replace/replace_prop_ashift.ksh \ functional/cli_root/zpool_replace/setup.ksh \ functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh \ functional/cli_root/zpool_resilver/cleanup.ksh \ functional/cli_root/zpool_resilver/setup.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \ functional/cli_root/zpool_scrub/cleanup.ksh \ functional/cli_root/zpool_scrub/setup.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_pools.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_txg_continue_from_last.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_date_range_001.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \ functional/cli_root/zpool_set/cleanup.ksh \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_common.kshlib \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_003_neg.ksh \ functional/cli_root/zpool_set/zpool_set_ashift.ksh \ functional/cli_root/zpool_set/user_property_001_pos.ksh \ functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ functional/cli_root/zpool_set/zpool_set_clear_userprop.ksh \ functional/cli_root/zpool_split/cleanup.ksh \ functional/cli_root/zpool_split/setup.ksh \ functional/cli_root/zpool_split/zpool_split_cliargs.ksh \ functional/cli_root/zpool_split/zpool_split_devices.ksh \ functional/cli_root/zpool_split/zpool_split_dryrun_output.ksh \ functional/cli_root/zpool_split/zpool_split_encryption.ksh \ functional/cli_root/zpool_split/zpool_split_indirect.ksh \ 
functional/cli_root/zpool_split/zpool_split_props.ksh \ functional/cli_root/zpool_split/zpool_split_resilver.ksh \ functional/cli_root/zpool_split/zpool_split_vdevs.ksh \ functional/cli_root/zpool_split/zpool_split_wholedisk.ksh \ functional/cli_root/zpool_status/cleanup.ksh \ functional/cli_root/zpool_status/setup.ksh \ functional/cli_root/zpool_status/zpool_status_001_pos.ksh \ functional/cli_root/zpool_status/zpool_status_002_pos.ksh \ functional/cli_root/zpool_status/zpool_status_003_pos.ksh \ functional/cli_root/zpool_status/zpool_status_004_pos.ksh \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh \ functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh \ functional/cli_root/zpool_trim/cleanup.ksh \ functional/cli_root/zpool_trim/setup.ksh \ functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh \ functional/cli_root/zpool_trim/zpool_trim_fault_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_import_export.ksh \ functional/cli_root/zpool_trim/zpool_trim_multiple.ksh \ functional/cli_root/zpool_trim/zpool_trim_multiple_pools.ksh \ functional/cli_root/zpool_trim/zpool_trim_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh \ functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh \ functional/cli_root/zpool_trim/zpool_trim_partial.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate.ksh \ functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_secure.ksh \ functional/cli_root/zpool_trim/zpool_trim_split.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh \ functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh \ functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh \ functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh \ functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh \ functional/cli_root/zpool_upgrade/cleanup.ksh \ functional/cli_root/zpool_upgrade/setup.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh \ functional/cli_root/zpool_upgrade/zpool_upgrade_features_001_pos.ksh \ functional/cli_root/zpool_wait/cleanup.ksh \ functional/cli_root/zpool_wait/scan/cleanup.ksh \ functional/cli_root/zpool_wait/scan/setup.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh \ 
functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh \ functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh \ functional/cli_root/zpool_wait/setup.ksh \ functional/cli_root/zpool_wait/zpool_wait_discard.ksh \ functional/cli_root/zpool_wait/zpool_wait_freeing.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_multiple.ksh \ functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_remove.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh \ functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh \ functional/cli_root/zpool_wait/zpool_wait_usage.ksh \ functional/cli_root/zpool/zpool_001_neg.ksh \ functional/cli_root/zpool/zpool_002_pos.ksh \ functional/cli_root/zpool/zpool_003_pos.ksh \ functional/cli_root/zpool/zpool_colors.ksh \ functional/cli_user/misc/arcstat_001_pos.ksh \ functional/cli_user/misc/arc_summary_001_pos.ksh \ functional/cli_user/misc/arc_summary_002_neg.ksh \ functional/cli_user/misc/zilstat_001_pos.ksh \ functional/cli_user/misc/cleanup.ksh \ functional/cli_user/misc/setup.ksh \ functional/cli_user/misc/zdb_001_neg.ksh \ functional/cli_user/misc/zfs_001_neg.ksh \ functional/cli_user/misc/zfs_allow_001_neg.ksh \ functional/cli_user/misc/zfs_clone_001_neg.ksh \ functional/cli_user/misc/zfs_create_001_neg.ksh \ functional/cli_user/misc/zfs_destroy_001_neg.ksh \ functional/cli_user/misc/zfs_get_001_neg.ksh \ functional/cli_user/misc/zfs_inherit_001_neg.ksh \ functional/cli_user/misc/zfs_mount_001_neg.ksh \ functional/cli_user/misc/zfs_promote_001_neg.ksh \ functional/cli_user/misc/zfs_receive_001_neg.ksh \ functional/cli_user/misc/zfs_rename_001_neg.ksh \ functional/cli_user/misc/zfs_rollback_001_neg.ksh \ functional/cli_user/misc/zfs_send_001_neg.ksh \ functional/cli_user/misc/zfs_set_001_neg.ksh \ functional/cli_user/misc/zfs_share_001_neg.ksh \ functional/cli_user/misc/zfs_snapshot_001_neg.ksh \ functional/cli_user/misc/zfs_unallow_001_neg.ksh \ functional/cli_user/misc/zfs_unmount_001_neg.ksh \ functional/cli_user/misc/zfs_unshare_001_neg.ksh \ functional/cli_user/misc/zfs_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_001_neg.ksh \ functional/cli_user/misc/zpool_add_001_neg.ksh \ functional/cli_user/misc/zpool_attach_001_neg.ksh \ functional/cli_user/misc/zpool_clear_001_neg.ksh \ functional/cli_user/misc/zpool_create_001_neg.ksh \ functional/cli_user/misc/zpool_destroy_001_neg.ksh \ functional/cli_user/misc/zpool_detach_001_neg.ksh \ functional/cli_user/misc/zpool_export_001_neg.ksh \ functional/cli_user/misc/zpool_get_001_neg.ksh \ functional/cli_user/misc/zpool_history_001_neg.ksh \ functional/cli_user/misc/zpool_import_001_neg.ksh \ functional/cli_user/misc/zpool_import_002_neg.ksh \ functional/cli_user/misc/zpool_offline_001_neg.ksh \ functional/cli_user/misc/zpool_online_001_neg.ksh \ functional/cli_user/misc/zpool_remove_001_neg.ksh \ functional/cli_user/misc/zpool_replace_001_neg.ksh \ functional/cli_user/misc/zpool_scrub_001_neg.ksh \ functional/cli_user/misc/zpool_set_001_neg.ksh \ functional/cli_user/misc/zpool_status_001_neg.ksh \ functional/cli_user/misc/zpool_upgrade_001_neg.ksh \ functional/cli_user/misc/zpool_wait_privilege.ksh \ 
functional/cli_user/zfs_list/cleanup.ksh \ functional/cli_user/zfs_list/setup.ksh \ functional/cli_user/zfs_list/zfs_list_001_pos.ksh \ functional/cli_user/zfs_list/zfs_list_002_pos.ksh \ functional/cli_user/zfs_list/zfs_list_003_pos.ksh \ functional/cli_user/zfs_list/zfs_list_004_neg.ksh \ functional/cli_user/zfs_list/zfs_list_005_neg.ksh \ functional/cli_user/zfs_list/zfs_list_007_pos.ksh \ functional/cli_user/zfs_list/zfs_list_008_neg.ksh \ functional/cli_user/zpool_iostat/cleanup.ksh \ functional/cli_user/zpool_iostat/setup.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh \ functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh \ functional/cli_user/zpool_list/cleanup.ksh \ functional/cli_user/zpool_list/setup.ksh \ functional/cli_user/zpool_list/zpool_list_001_pos.ksh \ functional/cli_user/zpool_list/zpool_list_002_neg.ksh \ functional/cli_user/zpool_status/cleanup.ksh \ functional/cli_user/zpool_status/setup.ksh \ functional/cli_user/zpool_status/zpool_status_003_pos.ksh \ functional/cli_user/zpool_status/zpool_status_-c_disable.ksh \ functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh \ functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh \ functional/compression/cleanup.ksh \ functional/compression/compress_001_pos.ksh \ functional/compression/compress_002_pos.ksh \ functional/compression/compress_003_pos.ksh \ functional/compression/compress_004_pos.ksh \ functional/compression/compress_zstd_bswap.ksh \ functional/compression/l2arc_compressed_arc_disabled.ksh \ functional/compression/l2arc_compressed_arc.ksh \ functional/compression/l2arc_encrypted.ksh \ functional/compression/l2arc_encrypted_no_compressed_arc.ksh \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ functional/cp_files/cp_files_002_pos.ksh \ functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ functional/crtime/crtime_001_pos.ksh \ functional/crtime/setup.ksh \ functional/crypto/icp_aes_ccm.ksh \ functional/crypto/icp_aes_gcm.ksh \ functional/ctime/cleanup.ksh \ functional/ctime/ctime_001_pos.ksh \ functional/ctime/setup.ksh \ functional/deadman/deadman_ratelimit.ksh \ functional/deadman/deadman_sync.ksh \ functional/deadman/deadman_zio.ksh \ functional/dedup/cleanup.ksh \ functional/dedup/setup.ksh \ functional/dedup/dedup_fdt_create.ksh \ functional/dedup/dedup_fdt_import.ksh \ functional/dedup/dedup_fdt_pacing.ksh \ functional/dedup/dedup_legacy_create.ksh \ functional/dedup/dedup_legacy_import.ksh \ functional/dedup/dedup_legacy_fdt_upgrade.ksh \ functional/dedup/dedup_legacy_fdt_mixed.ksh \ functional/dedup/dedup_prune.ksh \ functional/dedup/dedup_quota.ksh \ functional/dedup/dedup_zap_shrink.ksh \ functional/delegate/cleanup.ksh \ functional/delegate/setup.ksh \ functional/delegate/zfs_allow_001_pos.ksh \ functional/delegate/zfs_allow_002_pos.ksh \ functional/delegate/zfs_allow_003_pos.ksh \ functional/delegate/zfs_allow_004_pos.ksh \ functional/delegate/zfs_allow_005_pos.ksh \ functional/delegate/zfs_allow_006_pos.ksh \ functional/delegate/zfs_allow_007_pos.ksh \ 
functional/delegate/zfs_allow_008_pos.ksh \ functional/delegate/zfs_allow_009_neg.ksh \ functional/delegate/zfs_allow_010_pos.ksh \ functional/delegate/zfs_allow_011_neg.ksh \ functional/delegate/zfs_allow_012_neg.ksh \ functional/delegate/zfs_unallow_001_pos.ksh \ functional/delegate/zfs_unallow_002_pos.ksh \ functional/delegate/zfs_unallow_003_pos.ksh \ functional/delegate/zfs_unallow_004_pos.ksh \ functional/delegate/zfs_unallow_005_pos.ksh \ functional/delegate/zfs_unallow_006_pos.ksh \ functional/delegate/zfs_unallow_007_neg.ksh \ functional/delegate/zfs_unallow_008_neg.ksh \ functional/devices/cleanup.ksh \ functional/devices/devices_001_pos.ksh \ functional/devices/devices_002_neg.ksh \ functional/devices/devices_003_pos.ksh \ functional/devices/setup.ksh \ functional/direct/dio_aligned_block.ksh \ functional/direct/dio_async_always.ksh \ functional/direct/dio_async_fio_ioengines.ksh \ functional/direct/dio_compression.ksh \ functional/direct/dio_dedup.ksh \ functional/direct/dio_encryption.ksh \ functional/direct/dio_grow_block.ksh \ functional/direct/dio_loopback_dev.ksh \ functional/direct/dio_max_recordsize.ksh \ functional/direct/dio_mixed.ksh \ functional/direct/dio_mmap.ksh \ functional/direct/dio_overwrites.ksh \ functional/direct/dio_property.ksh \ functional/direct/dio_random.ksh \ functional/direct/dio_read_verify.ksh \ functional/direct/dio_recordsize.ksh \ functional/direct/dio_unaligned_block.ksh \ functional/direct/dio_unaligned_filesize.ksh \ functional/direct/dio_write_verify.ksh \ functional/direct/dio_write_stable_pages.ksh \ functional/direct/setup.ksh \ functional/direct/cleanup.ksh \ functional/dos_attributes/cleanup.ksh \ functional/dos_attributes/read_dos_attrs_001.ksh \ functional/dos_attributes/setup.ksh \ functional/dos_attributes/write_dos_attrs_001.ksh \ functional/events/cleanup.ksh \ functional/events/events_001_pos.ksh \ functional/events/events_002_pos.ksh \ functional/events/setup.ksh \ functional/events/zed_cksum_config.ksh \ functional/events/zed_cksum_reported.ksh \ functional/events/zed_diagnose_multiple.ksh \ functional/events/zed_fd_spill.ksh \ functional/events/zed_io_config.ksh \ functional/events/zed_rc_filter.ksh \ functional/events/zed_slow_io.ksh \ functional/events/zed_slow_io_many_vdevs.ksh \ functional/exec/cleanup.ksh \ functional/exec/exec_001_pos.ksh \ functional/exec/exec_002_neg.ksh \ functional/exec/setup.ksh \ functional/fadvise/cleanup.ksh \ functional/fadvise/fadvise_willneed.ksh \ functional/fadvise/setup.ksh \ functional/failmode/cleanup.ksh \ functional/failmode/failmode_dmu_tx_wait.ksh \ functional/failmode/failmode_dmu_tx_continue.ksh \ functional/failmode/failmode_fsync_wait.ksh \ functional/failmode/failmode_fsync_continue.ksh \ functional/failmode/failmode_msync_wait.ksh \ functional/failmode/failmode_msync_continue.ksh \ functional/failmode/failmode_osync_wait.ksh \ functional/failmode/failmode_osync_continue.ksh \ functional/failmode/failmode_syncalways_wait.ksh \ functional/failmode/failmode_syncalways_continue.ksh \ functional/failmode/setup.ksh \ functional/fallocate/cleanup.ksh \ functional/fallocate/fallocate_prealloc.ksh \ functional/fallocate/fallocate_punch-hole.ksh \ functional/fallocate/fallocate_zero-range.ksh \ functional/fallocate/setup.ksh \ functional/fault/auto_offline_001_pos.ksh \ functional/fault/auto_online_001_pos.ksh \ functional/fault/auto_online_002_pos.ksh \ functional/fault/auto_replace_001_pos.ksh \ functional/fault/auto_replace_002_pos.ksh \ functional/fault/auto_spare_001_pos.ksh \ 
functional/fault/auto_spare_002_pos.ksh \ functional/fault/auto_spare_ashift.ksh \ functional/fault/auto_spare_double.ksh \ functional/fault/auto_spare_multiple.ksh \ functional/fault/auto_spare_shared.ksh \ functional/fault/cleanup.ksh \ functional/fault/decompress_fault.ksh \ functional/fault/decrypt_fault.ksh \ functional/fault/fault_limits.ksh \ functional/fault/scrub_after_resilver.ksh \ functional/fault/suspend_on_probe_errors.ksh \ functional/fault/suspend_resume_single.ksh \ functional/fault/setup.ksh \ functional/fault/zpool_status_-s.ksh \ functional/features/async_destroy/async_destroy_001_pos.ksh \ functional/features/async_destroy/cleanup.ksh \ functional/features/async_destroy/setup.ksh \ functional/features/large_dnode/cleanup.ksh \ functional/features/large_dnode/large_dnode_001_pos.ksh \ functional/features/large_dnode/large_dnode_002_pos.ksh \ functional/features/large_dnode/large_dnode_003_pos.ksh \ functional/features/large_dnode/large_dnode_004_neg.ksh \ functional/features/large_dnode/large_dnode_005_pos.ksh \ functional/features/large_dnode/large_dnode_006_pos.ksh \ functional/features/large_dnode/large_dnode_007_neg.ksh \ functional/features/large_dnode/large_dnode_008_pos.ksh \ functional/features/large_dnode/large_dnode_009_pos.ksh \ functional/features/large_dnode/setup.ksh \ functional/gang_blocks/cleanup.ksh \ functional/gang_blocks/gang_blocks_001_pos.ksh \ functional/gang_blocks/gang_blocks_ddt_copies.ksh \ functional/gang_blocks/gang_blocks_redundant.ksh \ functional/gang_blocks/gang_blocks_dyn_header_neg.ksh \ functional/gang_blocks/gang_blocks_dyn_header_pos.ksh \ functional/gang_blocks/gang_blocks_dyn_multi.ksh \ functional/gang_blocks/setup.ksh \ functional/grow/grow_pool_001_pos.ksh \ functional/grow/grow_replicas_001_pos.ksh \ functional/history/cleanup.ksh \ functional/history/history_001_pos.ksh \ functional/history/history_002_pos.ksh \ functional/history/history_003_pos.ksh \ functional/history/history_004_pos.ksh \ functional/history/history_005_neg.ksh \ functional/history/history_006_neg.ksh \ functional/history/history_007_pos.ksh \ functional/history/history_008_pos.ksh \ functional/history/history_009_pos.ksh \ functional/history/history_010_pos.ksh \ functional/history/setup.ksh \ functional/inheritance/cleanup.ksh \ functional/inheritance/inherit_001_pos.ksh \ functional/inuse/inuse_001_pos.ksh \ functional/inuse/inuse_003_pos.ksh \ functional/inuse/inuse_004_pos.ksh \ functional/inuse/inuse_005_pos.ksh \ functional/inuse/inuse_006_pos.ksh \ functional/inuse/inuse_007_pos.ksh \ functional/inuse/inuse_008_pos.ksh \ functional/inuse/inuse_009_pos.ksh \ functional/inuse/setup.ksh \ functional/io/cleanup.ksh \ functional/io/io_uring.ksh \ functional/io/libaio.ksh \ functional/io/mmap.ksh \ functional/io/posixaio.ksh \ functional/io/psync.ksh \ functional/io/setup.ksh \ functional/io/sync.ksh \ functional/l2arc/cleanup.ksh \ functional/l2arc/l2arc_arcstats_pos.ksh \ functional/l2arc/l2arc_l2miss_pos.ksh \ functional/l2arc/l2arc_mfuonly_pos.ksh \ functional/l2arc/persist_l2arc_001_pos.ksh \ functional/l2arc/persist_l2arc_002_pos.ksh \ functional/l2arc/persist_l2arc_003_neg.ksh \ functional/l2arc/persist_l2arc_004_pos.ksh \ functional/l2arc/persist_l2arc_005_pos.ksh \ functional/l2arc/setup.ksh \ functional/large_files/cleanup.ksh \ functional/large_files/large_files_001_pos.ksh \ functional/large_files/large_files_002_pos.ksh \ functional/large_files/setup.ksh \ functional/largest_pool/largest_pool_001_pos.ksh \ functional/libzfs/cleanup.ksh \ 
functional/libzfs/libzfs_input.ksh \
functional/libzfs/setup.ksh \
functional/limits/cleanup.ksh \
functional/limits/filesystem_count.ksh \
functional/limits/filesystem_limit.ksh \
functional/limits/setup.ksh \
functional/limits/snapshot_count.ksh \
functional/limits/snapshot_limit.ksh \
functional/link_count/cleanup.ksh \
functional/link_count/link_count_001.ksh \
functional/link_count/link_count_root_inode.ksh \
functional/link_count/setup.ksh \
functional/longname/cleanup.ksh \
functional/longname/longname_001_pos.ksh \
functional/longname/longname_002_pos.ksh \
functional/longname/longname_003_pos.ksh \
functional/longname/setup.ksh \
functional/log_spacemap/log_spacemap_import_logs.ksh \
functional/migration/cleanup.ksh \
functional/migration/migration_001_pos.ksh \
functional/migration/migration_002_pos.ksh \
functional/migration/migration_003_pos.ksh \
functional/migration/migration_004_pos.ksh \
functional/migration/migration_005_pos.ksh \
functional/migration/migration_006_pos.ksh \
functional/migration/migration_007_pos.ksh \
functional/migration/migration_008_pos.ksh \
functional/migration/migration_009_pos.ksh \
functional/migration/migration_010_pos.ksh \
functional/migration/migration_011_pos.ksh \
functional/migration/migration_012_pos.ksh \
functional/migration/setup.ksh \
functional/mmap/cleanup.ksh \
functional/mmap/mmap_libaio_001_pos.ksh \
functional/mmap/mmap_mixed.ksh \
functional/mmap/mmap_read_001_pos.ksh \
functional/mmap/mmap_seek_001_pos.ksh \
functional/mmap/mmap_sync_001_pos.ksh \
functional/mmap/mmap_write_001_pos.ksh \
functional/mmap/mmap_ftruncate.ksh \
functional/mmap/setup.ksh \
functional/mmp/cleanup.ksh \
functional/mmp/mmp_active_import.ksh \
functional/mmp/mmp_exported_import.ksh \
functional/mmp/mmp_hostid.ksh \
functional/mmp/mmp_inactive_import.ksh \
functional/mmp/mmp_interval.ksh \
functional/mmp/mmp_on_off.ksh \
functional/mmp/mmp_on_thread.ksh \
functional/mmp/mmp_on_uberblocks.ksh \
functional/mmp/mmp_on_zdb.ksh \
functional/mmp/mmp_reset_interval.ksh \
functional/mmp/mmp_write_distribution.ksh \
functional/mmp/mmp_write_slow_disk.ksh \
functional/mmp/mmp_write_uberblocks.ksh \
functional/mmp/multihost_history.ksh \
functional/mmp/setup.ksh \
functional/mount/cleanup.ksh \
functional/mount/setup.ksh \
+ functional/mount/mount_loopback.ksh \
functional/mount/umount_001.ksh \
functional/mount/umountall_001.ksh \
functional/mount/umount_unlinked_drain.ksh \
functional/mv_files/cleanup.ksh \
functional/mv_files/mv_files_001_pos.ksh \
functional/mv_files/mv_files_002_pos.ksh \
functional/mv_files/random_creation.ksh \
functional/mv_files/setup.ksh \
functional/nestedfs/cleanup.ksh \
functional/nestedfs/nestedfs_001_pos.ksh \
functional/nestedfs/setup.ksh \
functional/nopwrite/cleanup.ksh \
functional/nopwrite/nopwrite_copies.ksh \
functional/nopwrite/nopwrite_mtime.ksh \
functional/nopwrite/nopwrite_negative.ksh \
functional/nopwrite/nopwrite_promoted_clone.ksh \
functional/nopwrite/nopwrite_recsize.ksh \
functional/nopwrite/nopwrite_sync.ksh \
functional/nopwrite/nopwrite_varying_compression.ksh \
functional/nopwrite/nopwrite_volume.ksh \
functional/nopwrite/setup.ksh \
functional/no_space/cleanup.ksh \
functional/no_space/enospc_001_pos.ksh \
functional/no_space/enospc_002_pos.ksh \
functional/no_space/enospc_003_pos.ksh \
functional/no_space/enospc_df.ksh \
functional/no_space/enospc_ganging.ksh \
functional/no_space/enospc_rm.ksh \
functional/no_space/setup.ksh \
functional/online_offline/cleanup.ksh \
functional/online_offline/online_offline_001_pos.ksh \ functional/online_offline/online_offline_002_neg.ksh \ functional/online_offline/online_offline_003_neg.ksh \ functional/online_offline/setup.ksh \ functional/pam/cleanup.ksh \ functional/pam/pam_basic.ksh \ functional/pam/pam_change_unmounted.ksh \ functional/pam/pam_mount_recursively.ksh \ functional/pam/pam_nounmount.ksh \ functional/pam/pam_recursive.ksh \ functional/pam/pam_short_password.ksh \ functional/pam/setup.ksh \ functional/pool_checkpoint/checkpoint_after_rewind.ksh \ functional/pool_checkpoint/checkpoint_big_rewind.ksh \ functional/pool_checkpoint/checkpoint_capacity.ksh \ functional/pool_checkpoint/checkpoint_conf_change.ksh \ functional/pool_checkpoint/checkpoint_discard_busy.ksh \ functional/pool_checkpoint/checkpoint_discard.ksh \ functional/pool_checkpoint/checkpoint_discard_many.ksh \ functional/pool_checkpoint/checkpoint_indirect.ksh \ functional/pool_checkpoint/checkpoint_invalid.ksh \ functional/pool_checkpoint/checkpoint_lun_expsz.ksh \ functional/pool_checkpoint/checkpoint_open.ksh \ functional/pool_checkpoint/checkpoint_removal.ksh \ functional/pool_checkpoint/checkpoint_rewind.ksh \ functional/pool_checkpoint/checkpoint_ro_rewind.ksh \ functional/pool_checkpoint/checkpoint_sm_scale.ksh \ functional/pool_checkpoint/checkpoint_twice.ksh \ functional/pool_checkpoint/checkpoint_vdev_add.ksh \ functional/pool_checkpoint/checkpoint_zdb.ksh \ functional/pool_checkpoint/checkpoint_zhack_feat.ksh \ functional/pool_checkpoint/cleanup.ksh \ functional/pool_checkpoint/setup.ksh \ functional/pool_names/pool_names_001_pos.ksh \ functional/pool_names/pool_names_002_neg.ksh \ functional/poolversion/cleanup.ksh \ functional/poolversion/poolversion_001_pos.ksh \ functional/poolversion/poolversion_002_pos.ksh \ functional/poolversion/setup.ksh \ functional/privilege/cleanup.ksh \ functional/privilege/privilege_001_pos.ksh \ functional/privilege/privilege_002_pos.ksh \ functional/privilege/setup.ksh \ functional/procfs/cleanup.ksh \ functional/procfs/pool_state.ksh \ functional/procfs/procfs_list_basic.ksh \ functional/procfs/procfs_list_concurrent_readers.ksh \ functional/procfs/procfs_list_stale_read.ksh \ functional/procfs/setup.ksh \ functional/projectquota/cleanup.ksh \ functional/projectquota/projectid_001_pos.ksh \ functional/projectquota/projectid_002_pos.ksh \ functional/projectquota/projectid_003_pos.ksh \ functional/projectquota/projectquota_001_pos.ksh \ functional/projectquota/projectquota_002_pos.ksh \ functional/projectquota/projectquota_003_pos.ksh \ functional/projectquota/projectquota_004_neg.ksh \ functional/projectquota/projectquota_005_pos.ksh \ functional/projectquota/projectquota_006_pos.ksh \ functional/projectquota/projectquota_007_pos.ksh \ functional/projectquota/projectquota_008_pos.ksh \ functional/projectquota/projectquota_009_pos.ksh \ functional/projectquota/defaultprojectquota_001_pos.ksh \ functional/projectquota/defaultprojectquota_002_pos.ksh \ functional/projectquota/defaultprojectquota_003_neg.ksh \ functional/projectquota/defaultprojectquota_004_pos.ksh \ functional/projectquota/defaultprojectquota_005_pos.ksh \ functional/projectquota/defaultprojectquota_006_pos.ksh \ functional/projectquota/defaultprojectquota_007_pos.ksh \ functional/projectquota/projectspace_001_pos.ksh \ functional/projectquota/projectspace_002_pos.ksh \ functional/projectquota/projectspace_003_pos.ksh \ functional/projectquota/projectspace_004_pos.ksh \ functional/projectquota/projectspace_005_pos.ksh \ 
functional/projectquota/projecttree_001_pos.ksh \ functional/projectquota/projecttree_002_pos.ksh \ functional/projectquota/projecttree_003_neg.ksh \ functional/projectquota/setup.ksh \ functional/quota/cleanup.ksh \ functional/quota/quota_001_pos.ksh \ functional/quota/quota_002_pos.ksh \ functional/quota/quota_003_pos.ksh \ functional/quota/quota_004_pos.ksh \ functional/quota/quota_005_pos.ksh \ functional/quota/quota_006_neg.ksh \ functional/quota/setup.ksh \ functional/raidz/cleanup.ksh \ functional/raidz/raidz_001_neg.ksh \ functional/raidz/raidz_002_pos.ksh \ functional/raidz/raidz_expand_001_pos.ksh \ functional/raidz/raidz_expand_002_pos.ksh \ functional/raidz/raidz_expand_003_neg.ksh \ functional/raidz/raidz_expand_003_pos.ksh \ functional/raidz/raidz_expand_004_pos.ksh \ functional/raidz/raidz_expand_005_pos.ksh \ functional/raidz/raidz_expand_006_neg.ksh \ functional/raidz/raidz_expand_007_neg.ksh \ functional/raidz/setup.ksh \ functional/redacted_send/cleanup.ksh \ functional/redacted_send/redacted_compressed.ksh \ functional/redacted_send/redacted_contents.ksh \ functional/redacted_send/redacted_deleted.ksh \ functional/redacted_send/redacted_disabled_feature.ksh \ functional/redacted_send/redacted_embedded.ksh \ functional/redacted_send/redacted_holes.ksh \ functional/redacted_send/redacted_incrementals.ksh \ functional/redacted_send/redacted_largeblocks.ksh \ functional/redacted_send/redacted_many_clones.ksh \ functional/redacted_send/redacted_mixed_recsize.ksh \ functional/redacted_send/redacted_mounts.ksh \ functional/redacted_send/redacted_negative.ksh \ functional/redacted_send/redacted_origin.ksh \ functional/redacted_send/redacted_panic.ksh \ functional/redacted_send/redacted_props.ksh \ functional/redacted_send/redacted_resume.ksh \ functional/redacted_send/redacted_size.ksh \ functional/redacted_send/redacted_volume.ksh \ functional/redacted_send/setup.ksh \ functional/redundancy/cleanup.ksh \ functional/redundancy/redundancy_draid1.ksh \ functional/redundancy/redundancy_draid2.ksh \ functional/redundancy/redundancy_draid3.ksh \ functional/redundancy/redundancy_draid_damaged1.ksh \ functional/redundancy/redundancy_draid_damaged2.ksh \ functional/redundancy/redundancy_draid.ksh \ functional/redundancy/redundancy_draid_spare1.ksh \ functional/redundancy/redundancy_draid_spare2.ksh \ functional/redundancy/redundancy_draid_spare3.ksh \ functional/redundancy/redundancy_mirror.ksh \ functional/redundancy/redundancy_raidz1.ksh \ functional/redundancy/redundancy_raidz2.ksh \ functional/redundancy/redundancy_raidz3.ksh \ functional/redundancy/redundancy_raidz.ksh \ functional/redundancy/redundancy_stripe.ksh \ functional/redundancy/setup.ksh \ functional/refquota/cleanup.ksh \ functional/refquota/refquota_001_pos.ksh \ functional/refquota/refquota_002_pos.ksh \ functional/refquota/refquota_003_pos.ksh \ functional/refquota/refquota_004_pos.ksh \ functional/refquota/refquota_005_pos.ksh \ functional/refquota/refquota_006_neg.ksh \ functional/refquota/refquota_007_neg.ksh \ functional/refquota/refquota_008_neg.ksh \ functional/refquota/setup.ksh \ functional/refreserv/cleanup.ksh \ functional/refreserv/refreserv_001_pos.ksh \ functional/refreserv/refreserv_002_pos.ksh \ functional/refreserv/refreserv_003_pos.ksh \ functional/refreserv/refreserv_004_pos.ksh \ functional/refreserv/refreserv_005_pos.ksh \ functional/refreserv/refreserv_multi_raidz.ksh \ functional/refreserv/refreserv_raidz.ksh \ functional/refreserv/setup.ksh \ functional/removal/cleanup.ksh \ 
functional/removal/removal_all_vdev.ksh \ functional/removal/removal_cancel.ksh \ functional/removal/removal_check_space.ksh \ functional/removal/removal_condense_export.ksh \ functional/removal/removal_multiple_indirection.ksh \ functional/removal/removal_nopwrite.ksh \ functional/removal/removal_remap_deadlists.ksh \ functional/removal/removal_reservation.ksh \ functional/removal/removal_resume_export.ksh \ functional/removal/removal_sanity.ksh \ functional/removal/removal_with_add.ksh \ functional/removal/removal_with_create_fs.ksh \ functional/removal/removal_with_dedup.ksh \ functional/removal/removal_with_errors.ksh \ functional/removal/removal_with_export.ksh \ functional/removal/removal_with_faulted.ksh \ functional/removal/removal_with_ganging.ksh \ functional/removal/removal_with_hole.ksh \ functional/removal/removal_with_indirect.ksh \ functional/removal/removal_with_remove.ksh \ functional/removal/removal_with_scrub.ksh \ functional/removal/removal_with_send.ksh \ functional/removal/removal_with_send_recv.ksh \ functional/removal/removal_with_snapshot.ksh \ functional/removal/removal_with_write.ksh \ functional/removal/removal_with_zdb.ksh \ functional/removal/remove_attach_mirror.ksh \ functional/removal/remove_expanded.ksh \ functional/removal/remove_indirect.ksh \ functional/removal/remove_mirror.ksh \ functional/removal/remove_mirror_sanity.ksh \ functional/removal/remove_raidz.ksh \ functional/rename_dirs/cleanup.ksh \ functional/rename_dirs/rename_dirs_001_pos.ksh \ functional/rename_dirs/setup.ksh \ functional/renameat2/cleanup.ksh \ functional/renameat2/setup.ksh \ functional/renameat2/renameat2_exchange.ksh \ functional/renameat2/renameat2_noreplace.ksh \ functional/renameat2/renameat2_whiteout.ksh \ functional/replacement/attach_import.ksh \ functional/replacement/attach_multiple.ksh \ functional/replacement/attach_rebuild.ksh \ functional/replacement/attach_resilver.ksh \ functional/replacement/cleanup.ksh \ functional/replacement/detach.ksh \ functional/replacement/rebuild_disabled_feature.ksh \ functional/replacement/rebuild_multiple.ksh \ functional/replacement/rebuild_raidz.ksh \ functional/replacement/replace_import.ksh \ functional/replacement/replace_rebuild.ksh \ functional/replacement/replace_resilver.ksh \ functional/replacement/resilver_restart_001.ksh \ functional/replacement/resilver_restart_002.ksh \ functional/replacement/scrub_cancel.ksh \ functional/replacement/setup.ksh \ functional/reservation/cleanup.ksh \ functional/reservation/reservation_001_pos.ksh \ functional/reservation/reservation_002_pos.ksh \ functional/reservation/reservation_003_pos.ksh \ functional/reservation/reservation_004_pos.ksh \ functional/reservation/reservation_005_pos.ksh \ functional/reservation/reservation_006_pos.ksh \ functional/reservation/reservation_007_pos.ksh \ functional/reservation/reservation_008_pos.ksh \ functional/reservation/reservation_009_pos.ksh \ functional/reservation/reservation_010_pos.ksh \ functional/reservation/reservation_011_pos.ksh \ functional/reservation/reservation_012_pos.ksh \ functional/reservation/reservation_013_pos.ksh \ functional/reservation/reservation_014_pos.ksh \ functional/reservation/reservation_015_pos.ksh \ functional/reservation/reservation_016_pos.ksh \ functional/reservation/reservation_017_pos.ksh \ functional/reservation/reservation_018_pos.ksh \ functional/reservation/reservation_019_pos.ksh \ functional/reservation/reservation_020_pos.ksh \ functional/reservation/reservation_021_neg.ksh \ 
functional/reservation/reservation_022_pos.ksh \ functional/reservation/setup.ksh \ functional/rootpool/cleanup.ksh \ functional/rootpool/rootpool_002_neg.ksh \ functional/rootpool/rootpool_003_neg.ksh \ functional/rootpool/rootpool_007_pos.ksh \ functional/rootpool/setup.ksh \ functional/rsend/cleanup.ksh \ functional/rsend/recv_dedup_encrypted_zvol.ksh \ functional/rsend/recv_dedup.ksh \ functional/rsend/rsend_001_pos.ksh \ functional/rsend/rsend_002_pos.ksh \ functional/rsend/rsend_003_pos.ksh \ functional/rsend/rsend_004_pos.ksh \ functional/rsend/rsend_005_pos.ksh \ functional/rsend/rsend_006_pos.ksh \ functional/rsend/rsend_007_pos.ksh \ functional/rsend/rsend_008_pos.ksh \ functional/rsend/rsend_009_pos.ksh \ functional/rsend/rsend_010_pos.ksh \ functional/rsend/rsend_011_pos.ksh \ functional/rsend/rsend_012_pos.ksh \ functional/rsend/rsend_013_pos.ksh \ functional/rsend/rsend_014_pos.ksh \ functional/rsend/rsend_016_neg.ksh \ functional/rsend/rsend_019_pos.ksh \ functional/rsend/rsend_020_pos.ksh \ functional/rsend/rsend_021_pos.ksh \ functional/rsend/rsend_022_pos.ksh \ functional/rsend/rsend_024_pos.ksh \ functional/rsend/rsend_025_pos.ksh \ functional/rsend/rsend_026_neg.ksh \ functional/rsend/rsend_027_pos.ksh \ functional/rsend/rsend_028_neg.ksh \ functional/rsend/rsend_029_neg.ksh \ functional/rsend/rsend_030_pos.ksh \ functional/rsend/rsend_031_pos.ksh \ functional/rsend/send-c_embedded_blocks.ksh \ functional/rsend/send-c_incremental.ksh \ functional/rsend/send-c_longname.ksh \ functional/rsend/send-c_lz4_disabled.ksh \ functional/rsend/send-c_mixed_compression.ksh \ functional/rsend/send-c_props.ksh \ functional/rsend/send-c_recv_dedup.ksh \ functional/rsend/send-c_recv_lz4_disabled.ksh \ functional/rsend/send-c_resume.ksh \ functional/rsend/send-c_stream_size_estimate.ksh \ functional/rsend/send-c_verify_contents.ksh \ functional/rsend/send-c_verify_ratio.ksh \ functional/rsend/send-c_volume.ksh \ functional/rsend/send-c_zstream_recompress.ksh \ functional/rsend/send-c_zstreamdump.ksh \ functional/rsend/send-cpL_varied_recsize.ksh \ functional/rsend/send_doall.ksh \ functional/rsend/send_encrypted_incremental.ksh \ functional/rsend/send_encrypted_files.ksh \ functional/rsend/send_encrypted_freeobjects.ksh \ functional/rsend/send_encrypted_hierarchy.ksh \ functional/rsend/send_encrypted_props.ksh \ functional/rsend/send_encrypted_truncated_files.ksh \ functional/rsend/send_freeobjects.ksh \ functional/rsend/send_holds.ksh \ functional/rsend/send_hole_birth.ksh \ functional/rsend/send_invalid.ksh \ functional/rsend/send_leak_keymaps.ksh \ functional/rsend/send-L_toggle.ksh \ functional/rsend/send_mixed_raw.ksh \ functional/rsend/send_partial_dataset.ksh \ functional/rsend/send_raw_ashift.ksh \ functional/rsend/send_raw_spill_block.ksh \ functional/rsend/send_raw_large_blocks.ksh \ functional/rsend/send_realloc_dnode_size.ksh \ functional/rsend/send_realloc_encrypted_files.ksh \ functional/rsend/send_realloc_files.ksh \ functional/rsend/send_spill_block.ksh \ functional/rsend/send-wR_encrypted_zvol.ksh \ functional/rsend/setup.ksh \ functional/scrub_mirror/cleanup.ksh \ functional/scrub_mirror/scrub_mirror_001_pos.ksh \ functional/scrub_mirror/scrub_mirror_002_pos.ksh \ functional/scrub_mirror/scrub_mirror_003_pos.ksh \ functional/scrub_mirror/scrub_mirror_004_pos.ksh \ functional/scrub_mirror/setup.ksh \ functional/slog/cleanup.ksh \ functional/slog/setup.ksh \ functional/slog/slog_001_pos.ksh \ functional/slog/slog_002_pos.ksh \ functional/slog/slog_003_pos.ksh \ 
functional/slog/slog_004_pos.ksh \ functional/slog/slog_005_pos.ksh \ functional/slog/slog_006_pos.ksh \ functional/slog/slog_007_pos.ksh \ functional/slog/slog_008_neg.ksh \ functional/slog/slog_009_neg.ksh \ functional/slog/slog_010_neg.ksh \ functional/slog/slog_011_neg.ksh \ functional/slog/slog_012_neg.ksh \ functional/slog/slog_013_pos.ksh \ functional/slog/slog_014_pos.ksh \ functional/slog/slog_015_neg.ksh \ functional/slog/slog_016_pos.ksh \ functional/slog/slog_replay_fs_001.ksh \ functional/slog/slog_replay_fs_002.ksh \ functional/slog/slog_replay_volume.ksh \ functional/snapshot/cleanup.ksh \ functional/snapshot/clone_001_pos.ksh \ functional/snapshot/rollback_001_pos.ksh \ functional/snapshot/rollback_002_pos.ksh \ functional/snapshot/rollback_003_pos.ksh \ functional/snapshot/setup.ksh \ functional/snapshot/snapshot_001_pos.ksh \ functional/snapshot/snapshot_002_pos.ksh \ functional/snapshot/snapshot_003_pos.ksh \ functional/snapshot/snapshot_004_pos.ksh \ functional/snapshot/snapshot_005_pos.ksh \ functional/snapshot/snapshot_006_pos.ksh \ functional/snapshot/snapshot_007_pos.ksh \ functional/snapshot/snapshot_008_pos.ksh \ functional/snapshot/snapshot_009_pos.ksh \ functional/snapshot/snapshot_010_pos.ksh \ functional/snapshot/snapshot_011_pos.ksh \ functional/snapshot/snapshot_012_pos.ksh \ functional/snapshot/snapshot_013_pos.ksh \ functional/snapshot/snapshot_014_pos.ksh \ functional/snapshot/snapshot_015_pos.ksh \ functional/snapshot/snapshot_016_pos.ksh \ functional/snapshot/snapshot_017_pos.ksh \ functional/snapshot/snapshot_018_pos.ksh \ functional/snapused/cleanup.ksh \ functional/snapused/setup.ksh \ functional/snapused/snapused_001_pos.ksh \ functional/snapused/snapused_002_pos.ksh \ functional/snapused/snapused_003_pos.ksh \ functional/snapused/snapused_004_pos.ksh \ functional/snapused/snapused_005_pos.ksh \ functional/sparse/cleanup.ksh \ functional/sparse/setup.ksh \ functional/sparse/sparse_001_pos.ksh \ functional/stat/cleanup.ksh \ functional/stat/setup.ksh \ functional/stat/stat_001_pos.ksh \ functional/stat/statx_dioalign.ksh \ functional/syncfs/syncfs_suspend.ksh \ functional/suid/cleanup.ksh \ functional/suid/setup.ksh \ functional/suid/suid_write_to_none.ksh \ functional/suid/suid_write_to_sgid.ksh \ functional/suid/suid_write_to_suid.ksh \ functional/suid/suid_write_to_suid_sgid.ksh \ functional/suid/suid_write_zil_replay.ksh \ functional/trim/autotrim_config.ksh \ functional/trim/autotrim_integrity.ksh \ functional/trim/autotrim_trim_integrity.ksh \ functional/trim/cleanup.ksh \ functional/trim/setup.ksh \ functional/trim/trim_config.ksh \ functional/trim/trim_integrity.ksh \ functional/trim/trim_l2arc.ksh \ functional/truncate/cleanup.ksh \ functional/truncate/setup.ksh \ functional/truncate/truncate_001_pos.ksh \ functional/truncate/truncate_002_pos.ksh \ functional/truncate/truncate_timestamps.ksh \ functional/upgrade/cleanup.ksh \ functional/upgrade/setup.ksh \ functional/upgrade/upgrade_projectquota_001_pos.ksh \ functional/upgrade/upgrade_projectquota_002_pos.ksh \ functional/upgrade/upgrade_readonly_pool.ksh \ functional/upgrade/upgrade_userobj_001_pos.ksh \ functional/user_namespace/cleanup.ksh \ functional/user_namespace/setup.ksh \ functional/user_namespace/user_namespace_001.ksh \ functional/user_namespace/user_namespace_002.ksh \ functional/user_namespace/user_namespace_003.ksh \ functional/user_namespace/user_namespace_004.ksh \ functional/userquota/cleanup.ksh \ functional/userquota/groupspace_001_pos.ksh \ 
functional/userquota/groupspace_002_pos.ksh \ functional/userquota/groupspace_003_pos.ksh \ functional/userquota/groupspace_004_pos.ksh \ functional/userquota/setup.ksh \ functional/userquota/defaultuserquota_001_pos.ksh \ functional/userquota/defaultuserquota_002_pos.ksh \ functional/userquota/defaultuserquota_003_pos.ksh \ functional/userquota/defaultuserquota_004_neg.ksh \ functional/userquota/defaultuserquota_005_pos.ksh \ functional/userquota/defaultuserquota_006_pos.ksh \ functional/userquota/defaultuserquota_007_pos.ksh \ functional/userquota/defaultuserquota_008_pos.ksh \ functional/userquota/defaultuserquota_009_pos.ksh \ functional/userquota/defaultuserquota_010_neg.ksh \ functional/userquota/defaultuserquota_011_neg.ksh \ functional/userquota/defaultuserquota_012_neg.ksh \ functional/userquota/defaultuserquota_013_neg.ksh \ functional/userquota/userquota_001_pos.ksh \ functional/userquota/userquota_002_pos.ksh \ functional/userquota/userquota_003_pos.ksh \ functional/userquota/userquota_004_pos.ksh \ functional/userquota/userquota_005_neg.ksh \ functional/userquota/userquota_006_pos.ksh \ functional/userquota/userquota_007_pos.ksh \ functional/userquota/userquota_008_pos.ksh \ functional/userquota/userquota_009_pos.ksh \ functional/userquota/userquota_010_pos.ksh \ functional/userquota/userquota_011_pos.ksh \ functional/userquota/userquota_012_neg.ksh \ functional/userquota/userquota_013_pos.ksh \ functional/userquota/userspace_001_pos.ksh \ functional/userquota/userspace_002_pos.ksh \ functional/userquota/userspace_003_pos.ksh \ functional/userquota/userspace_004_pos.ksh \ functional/userquota/userspace_encrypted.ksh \ functional/userquota/userspace_send_encrypted.ksh \ functional/userquota/userspace_encrypted_13709.ksh \ functional/vdev_zaps/cleanup.ksh \ functional/vdev_zaps/setup.ksh \ functional/vdev_zaps/vdev_zaps_001_pos.ksh \ functional/vdev_zaps/vdev_zaps_002_pos.ksh \ functional/vdev_zaps/vdev_zaps_003_pos.ksh \ functional/vdev_zaps/vdev_zaps_004_pos.ksh \ functional/vdev_zaps/vdev_zaps_005_pos.ksh \ functional/vdev_zaps/vdev_zaps_006_pos.ksh \ functional/vdev_zaps/vdev_zaps_007_pos.ksh \ functional/write_dirs/cleanup.ksh \ functional/write_dirs/setup.ksh \ functional/write_dirs/write_dirs_001_pos.ksh \ functional/write_dirs/write_dirs_002_pos.ksh \ functional/xattr/cleanup.ksh \ functional/xattr/setup.ksh \ functional/xattr/xattr_001_pos.ksh \ functional/xattr/xattr_002_neg.ksh \ functional/xattr/xattr_003_neg.ksh \ functional/xattr/xattr_004_pos.ksh \ functional/xattr/xattr_005_pos.ksh \ functional/xattr/xattr_006_pos.ksh \ functional/xattr/xattr_007_neg.ksh \ functional/xattr/xattr_008_pos.ksh \ functional/xattr/xattr_009_neg.ksh \ functional/xattr/xattr_010_neg.ksh \ functional/xattr/xattr_011_pos.ksh \ functional/xattr/xattr_012_pos.ksh \ functional/xattr/xattr_013_pos.ksh \ functional/xattr/xattr_compat.ksh \ functional/zap_shrink/cleanup.ksh \ functional/zap_shrink/zap_shrink_001_pos.ksh \ functional/zap_shrink/setup.ksh \ functional/zpool_influxdb/cleanup.ksh \ functional/zpool_influxdb/setup.ksh \ functional/zpool_influxdb/zpool_influxdb.ksh \ functional/zvol/zvol_cli/cleanup.ksh \ functional/zvol/zvol_cli/setup.ksh \ functional/zvol/zvol_cli/zvol_cli_001_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_002_pos.ksh \ functional/zvol/zvol_cli/zvol_cli_003_neg.ksh \ functional/zvol/zvol_ENOSPC/cleanup.ksh \ functional/zvol/zvol_ENOSPC/setup.ksh \ functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh \ functional/zvol/zvol_misc/cleanup.ksh \ 
functional/zvol/zvol_misc/setup.ksh \ functional/zvol/zvol_misc/zvol_misc_001_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_002_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_003_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_004_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_005_neg.ksh \ functional/zvol/zvol_misc/zvol_misc_006_pos.ksh \ functional/zvol/zvol_misc/zvol_misc_fua.ksh \ functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh \ functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh \ functional/zvol/zvol_misc/zvol_misc_snapdev.ksh \ functional/zvol/zvol_misc/zvol_misc_trim.ksh \ functional/zvol/zvol_misc/zvol_misc_volmode.ksh \ functional/zvol/zvol_misc/zvol_misc_zil.ksh \ functional/zvol/zvol_stress/cleanup.ksh \ functional/zvol/zvol_stress/setup.ksh \ functional/zvol/zvol_stress/zvol_stress.ksh \ functional/zvol/zvol_stress/zvol_stress_destroy.ksh \ functional/zvol/zvol_swap/cleanup.ksh \ functional/zvol/zvol_swap/setup.ksh \ functional/zvol/zvol_swap/zvol_swap_001_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_002_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_003_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_004_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_005_pos.ksh \ functional/zvol/zvol_swap/zvol_swap_006_pos.ksh \ functional/idmap_mount/cleanup.ksh \ functional/idmap_mount/setup.ksh \ functional/idmap_mount/idmap_mount_001.ksh \ functional/idmap_mount/idmap_mount_002.ksh \ functional/idmap_mount/idmap_mount_003.ksh \ functional/idmap_mount/idmap_mount_004.ksh \ functional/idmap_mount/idmap_mount_005.ksh diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/fault_limits.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/fault_limits.ksh index 1b3310edb98b..45b041503e22 100755 --- a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/fault_limits.ksh +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/fault/fault_limits.ksh @@ -1,97 +1,97 @@ #!/bin/ksh -p # SPDX-License-Identifier: CDDL-1.0 # # CDDL HEADER START # # The contents of this file are subject to the terms of the # Common Development and Distribution License (the "License"). # You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or https://opensource.org/licenses/CDDL-1.0. # See the License for the specific language governing permissions # and limitations under the License. # # When distributing Covered Code, include this CDDL HEADER in each # file and include the License file at usr/src/OPENSOLARIS.LICENSE. # If applicable, add the following below this CDDL HEADER, with the # fields enclosed by brackets "[]" replaced with your own identifying # information: Portions Copyright [yyyy] [name of copyright owner] # # CDDL HEADER END # # # Copyright (c) 2024 by Klara, Inc. # . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/fault/fault.cfg # # DESCRIPTION: Verify that raidz children vdev fault count is restricted # # STRATEGY: # 1. Create a raidz2 or raidz3 pool and add some data to it # 2. Replace one of the child vdevs to create a replacing vdev # 3. While it is resilvering, attempt to fault disks # 4. 
Verify that less than parity count was faulted while replacing # TESTPOOL="fault-test-pool" PARITY=$((RANDOM%(2) + 2)) VDEV_CNT=$((4 + (2 * PARITY))) VDEV_SIZ=512M function cleanup { poolexists "$TESTPOOL" && log_must_busy zpool destroy "$TESTPOOL" for i in {0..$((VDEV_CNT - 1))}; do log_must rm -f "$TEST_BASE_DIR/dev-$i" done } log_onexit cleanup log_assert "restricts raidz children vdev fault count" log_note "creating $VDEV_CNT vdevs for parity $PARITY test" typeset -a disks for i in {0..$((VDEV_CNT - 1))}; do device=$TEST_BASE_DIR/dev-$i log_must truncate -s $VDEV_SIZ $device disks[${#disks[*]}+1]=$device done log_must zpool create -f ${TESTPOOL} raidz${PARITY} ${disks[1..$((VDEV_CNT - 1))]} # Add some data to the pool log_must zfs create $TESTPOOL/fs MNTPOINT="$(get_prop mountpoint $TESTPOOL/fs)" -log_must fill_fs $MNTPOINT $PARITY 200 32768 1000 Z +log_must fill_fs $MNTPOINT $PARITY 200 32768 100 R sync_pool $TESTPOOL # Replace the last child vdev to form a replacing vdev log_must zpool replace ${TESTPOOL} ${disks[$((VDEV_CNT - 1))]} ${disks[$VDEV_CNT]} # imediately offline replacement disk to keep replacing vdev around log_must zpool offline ${TESTPOOL} ${disks[$VDEV_CNT]} # Fault disks while a replacing vdev is still active for disk in ${disks[0..$PARITY]}; do log_must zpool offline -tf ${TESTPOOL} $disk done zpool status $TESTPOOL # Count the faults that succeeded faults=0 for disk in ${disks[0..$PARITY]}; do state=$(zpool get -H -o value state ${TESTPOOL} ${disk}) if [ "$state" = "FAULTED" ] ; then ((faults=faults+1)) fi done log_must test "$faults" -lt "$PARITY" log_must test "$faults" -gt 0 log_pass "restricts raidz children vdev fault count" diff --git a/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mount/mount_loopback.ksh b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mount/mount_loopback.ksh new file mode 100755 index 000000000000..86adef7ea032 --- /dev/null +++ b/sys/contrib/openzfs/tests/zfs-tests/tests/functional/mount/mount_loopback.ksh @@ -0,0 +1,111 @@ +#!/bin/ksh -p +# SPDX-License-Identifier: CDDL-1.0 +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# Copyright (c) 2025 by Lawrence Livermore National Security, LLC. + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify that we can make an xfs filesystem on a ZFS-backed loopback device. +# +# See: +# https://github.com/openzfs/zfs/pull/17298 +# https://github.com/openzfs/zfs/issues/17277 +# +# STRATEGY: +# 1. Make a pool +# 2. Make a file on the pool or create zvol +# 3. Mount the file/zvol behind a loopback device +# 4. 
Create & mount an xfs filesystem on the loopback device + +function cleanup +{ + if [ -d $TEST_BASE_DIR/mnt ] ; then + umount $TEST_BASE_DIR/mnt + log_must rmdir $TEST_BASE_DIR/mnt + fi + if [ -n "$DEV" ] ; then + log_must losetup -d $DEV + fi + destroy_pool $TESTPOOL2 + log_must rm -f $TEST_BASE_DIR/file1 +} + +if [ ! -x "$(which mkfs.xfs)" ] ; then + log_unsupported "No mkfs.xfs binary" +fi + +if [ ! -d /lib/modules/$(uname -r)/kernel/fs/xfs ] && \ + ! grep -qE '\sxfs$' /proc/filesystems ; then + log_unsupported "No XFS kernel support" +fi + +log_assert "Make an xfs filesystem on a ZFS-backed loopback device" +log_onexit cleanup + +# fio options +export NUMJOBS=2 +export RUNTIME=3 +export PERF_RANDSEED=1234 +export PERF_COMPPERCENT=66 +export PERF_COMPCHUNK=0 +export BLOCKSIZE=128K +export SYNC_TYPE=0 +export FILE_SIZE=$(( 1024 * 1024 )) + +function do_test +{ + imgfile=$1 + log_note "Running test on $imgfile" + log_must losetup -f $imgfile + DEV=$(losetup --associated $imgfile | grep -Eo '^/dev/loop[0-9]+') + log_must mkfs.xfs $DEV + mkdir $TEST_BASE_DIR/mnt + log_must mount $DEV $TEST_BASE_DIR/mnt + export DIRECTORY=$TEST_BASE_DIR/mnt + + for d in 0 1 ; do + # fio options + export DIRECT=$d + log_must fio $FIO_SCRIPTS/mkfiles.fio + log_must fio $FIO_SCRIPTS/random_reads.fio + done + log_must umount $TEST_BASE_DIR/mnt + log_must rmdir $TEST_BASE_DIR/mnt + log_must losetup -d $DEV + DEV="" +} + +log_must truncate -s 1G $TEST_BASE_DIR/file1 +log_must zpool create $TESTPOOL2 $TEST_BASE_DIR/file1 +log_must truncate -s 512M /$TESTPOOL2/img +do_test /$TESTPOOL2/img +log_must rm /$TESTPOOL2/img +log_must zfs create -V 512M $TESTPOOL2/vol + +blkdev="$ZVOL_DEVDIR/$TESTPOOL2/vol" +block_device_wait $blkdev +do_test $blkdev + +log_pass "Verified xfs filesystem on a ZFS-backed loopback device" diff --git a/sys/modules/zfs/zfs_config.h b/sys/modules/zfs/zfs_config.h index 72167b752e53..8b3e64eba2f3 100644 --- a/sys/modules/zfs/zfs_config.h +++ b/sys/modules/zfs/zfs_config.h @@ -1,879 +1,879 @@ /* */ /* zfs_config.h. Generated from zfs_config.h.in by configure. */ /* zfs_config.h.in. Generated from configure.ac by autoheader. */ /* Define to 1 if translation of program messages to the user's native language is requested. 
*/ /* #undef ENABLE_NLS */ /* __assign_str() has one arg */ /* #undef HAVE_1ARG_ASSIGN_STR */ /* lookup_bdev() wants 1 arg */ /* #undef HAVE_1ARG_LOOKUP_BDEV */ /* kernel has access_ok with 'type' parameter */ /* #undef HAVE_ACCESS_OK_TYPE */ /* add_disk() returns int */ /* #undef HAVE_ADD_DISK_RET */ /* Define if host toolchain supports AES */ #define HAVE_AES 1 /* Define if you have [rt] */ #define HAVE_AIO_H 1 #ifdef __amd64__ #ifndef RESCUE /* Define if host toolchain supports AVX */ #define HAVE_AVX 1 #endif /* Define if host toolchain supports AVX2 */ #define HAVE_AVX2 1 /* Define if host toolchain supports AVX512BW */ #define HAVE_AVX512BW 1 /* Define if host toolchain supports AVX512CD */ #define HAVE_AVX512CD 1 /* Define if host toolchain supports AVX512DQ */ #define HAVE_AVX512DQ 1 /* Define if host toolchain supports AVX512ER */ #define HAVE_AVX512ER 1 /* Define if host toolchain supports AVX512F */ #define HAVE_AVX512F 1 /* Define if host toolchain supports AVX512IFMA */ #define HAVE_AVX512IFMA 1 /* Define if host toolchain supports AVX512PF */ #define HAVE_AVX512PF 1 /* Define if host toolchain supports AVX512VBMI */ #define HAVE_AVX512VBMI 1 /* Define if host toolchain supports AVX512VL */ #define HAVE_AVX512VL 1 #endif /* backtrace() is available */ /* #undef HAVE_BACKTRACE */ /* bdevname() is available */ /* #undef HAVE_BDEVNAME */ /* bdev_check_media_change() exists */ /* #undef HAVE_BDEV_CHECK_MEDIA_CHANGE */ /* bdev_file_open_by_path() exists */ /* #undef HAVE_BDEV_FILE_OPEN_BY_PATH */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_63 */ /* bdev_*_io_acct() available */ /* #undef HAVE_BDEV_IO_ACCT_OLD */ /* bdev_kobj() exists */ /* #undef HAVE_BDEV_KOBJ */ /* bdev_max_discard_sectors() is available */ /* #undef HAVE_BDEV_MAX_DISCARD_SECTORS */ /* bdev_max_secure_erase_sectors() is available */ /* #undef HAVE_BDEV_MAX_SECURE_ERASE_SECTORS */ /* bdev_nr_bytes() is available */ /* #undef HAVE_BDEV_NR_BYTES */ /* bdev_open_by_path() exists */ /* #undef HAVE_BDEV_OPEN_BY_PATH */ /* bdev_release() exists */ /* #undef HAVE_BDEV_RELEASE */ /* block_device_operations->submit_bio() returns void */ /* #undef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID */ /* bdev_whole() is available */ /* #undef HAVE_BDEV_WHOLE */ /* bio_alloc() takes 4 arguments */ /* #undef HAVE_BIO_ALLOC_4ARG */ /* bio->bi_bdev->bd_disk exists */ /* #undef HAVE_BIO_BDEV_DISK */ /* bio_*_io_acct() available */ /* #undef HAVE_BIO_IO_ACCT */ /* bio_max_segs() is implemented */ /* #undef HAVE_BIO_MAX_SEGS */ /* bio_set_dev() GPL-only */ /* #undef HAVE_BIO_SET_DEV_GPL_ONLY */ /* bio_set_dev() is a macro */ /* #undef HAVE_BIO_SET_DEV_MACRO */ /* bio_set_op_attrs is available */ /* #undef HAVE_BIO_SET_OP_ATTRS */ /* blkdev_get_by_path() exists and takes 4 args */ /* #undef HAVE_BLKDEV_GET_BY_PATH_4ARG */ /* blkdev_get_by_path() handles ERESTARTSYS */ /* #undef HAVE_BLKDEV_GET_ERESTARTSYS */ /* __blkdev_issue_discard(flags) is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_FLAGS */ /* __blkdev_issue_discard() is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD_ASYNC_NOFLAGS */ /* blkdev_issue_discard(flags) is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD_FLAGS */ /* blkdev_issue_discard() is available */ /* #undef HAVE_BLKDEV_ISSUE_DISCARD_NOFLAGS */ /* blkdev_issue_secure_erase() is available */ /* #undef HAVE_BLKDEV_ISSUE_SECURE_ERASE */ /* blkdev_put() exists */ /* #undef HAVE_BLKDEV_PUT */ /* blkdev_put() accepts void* as arg 2 */ /* #undef HAVE_BLKDEV_PUT_HOLDER */ /* struct queue_limits has a 
features field */ /* #undef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES */ /* blkdev_reread_part() exists */ /* #undef HAVE_BLKDEV_REREAD_PART */ /* blkg_tryget() is available */ /* #undef HAVE_BLKG_TRYGET */ /* blkg_tryget() GPL-only */ /* #undef HAVE_BLKG_TRYGET_GPL_ONLY */ /* blk_alloc_disk() exists */ /* #undef HAVE_BLK_ALLOC_DISK */ /* blk_alloc_disk() exists and takes 2 args */ /* #undef HAVE_BLK_ALLOC_DISK_2ARG */ /* blk_alloc_queue() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN */ /* blk_alloc_queue_rh() expects request function */ /* #undef HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH */ /* blk_cleanup_disk() exists */ /* #undef HAVE_BLK_CLEANUP_DISK */ /* blk_mode_t is defined */ /* #undef HAVE_BLK_MODE_T */ /* block multiqueue hardware context is cached in struct request */ /* #undef HAVE_BLK_MQ_RQ_HCTX */ /* blk queue backing_dev_info is dynamic */ /* #undef HAVE_BLK_QUEUE_BDI_DYNAMIC */ /* blk_queue_discard() is available */ /* #undef HAVE_BLK_QUEUE_DISCARD */ /* backing_dev_info is available through queue gendisk */ /* #undef HAVE_BLK_QUEUE_DISK_BDI */ /* blk_queue_secure_erase() is available */ /* #undef HAVE_BLK_QUEUE_SECURE_ERASE */ /* blk_queue_update_readahead() exists */ /* #undef HAVE_BLK_QUEUE_UPDATE_READAHEAD */ /* BLK_STS_RESV_CONFLICT is defined */ /* #undef HAVE_BLK_STS_RESV_CONFLICT */ /* Define if release() in block_device_operations takes 1 arg */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG */ /* Define if revalidate_disk() in block_device_operations */ /* #undef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK */ /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ /* #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES */ /* Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework. */ /* #undef HAVE_CFPREFERENCESCOPYAPPVALUE */ /* check_disk_change() exists */ /* #undef HAVE_CHECK_DISK_CHANGE */ /* copy_splice_read exists */ /* #undef HAVE_COPY_SPLICE_READ */ /* cpu_has_feature() is GPL-only */ /* #undef HAVE_CPU_HAS_FEATURE_GPL_ONLY */ /* Define if the GNU dcgettext() function is already present or preinstalled. */ /* #undef HAVE_DCGETTEXT */ /* DECLARE_EVENT_CLASS() is available */ /* #undef HAVE_DECLARE_EVENT_CLASS */ /* 3-arg dequeue_signal() takes a type argument */ /* #undef HAVE_DEQUEUE_SIGNAL_3ARG_TYPE */ /* dequeue_signal() takes 4 arguments */ /* #undef HAVE_DEQUEUE_SIGNAL_4ARG */ /* lookup_bdev() wants dev_t arg */ /* #undef HAVE_DEVT_LOOKUP_BDEV */ /* disk_check_media_change() exists */ /* #undef HAVE_DISK_CHECK_MEDIA_CHANGE */ /* disk_*_io_acct() available */ /* #undef HAVE_DISK_IO_ACCT */ /* disk_update_readahead() exists */ /* #undef HAVE_DISK_UPDATE_READAHEAD */ /* Define to 1 if you have the header file. */ #define HAVE_DLFCN_H 1 /* dops->d_revalidate() takes 4 args */ /* #undef HAVE_D_REVALIDATE_4ARGS */ +/* Define if d_set_d_op() is available */ +/* #undef HAVE_D_SET_D_OP */ + /* Define to 1 if you have the 'execvpe' function. 
*/ #define HAVE_EXECVPE 1 /* fault_in_iov_iter_readable() is available */ /* #undef HAVE_FAULT_IN_IOV_ITER_READABLE */ /* file->f_version exists */ /* #undef HAVE_FILE_F_VERSION */ /* flush_dcache_page() is GPL-only */ /* #undef HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY */ /* Define if compiler supports -Wformat-overflow */ /* #undef HAVE_FORMAT_OVERFLOW */ /* fsync_bdev() is declared in include/blkdev.h */ /* #undef HAVE_FSYNC_BDEV */ /* yes */ /* #undef HAVE_GENERIC_FADVISE */ /* generic_fillattr requires struct mnt_idmap* */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP */ /* generic_fillattr requires struct mnt_idmap* and u32 request_mask */ /* #undef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK */ /* generic_fillattr requires struct user_namespace* */ /* #undef HAVE_GENERIC_FILLATTR_USERNS */ /* generic_*_io_acct() 4 arg available */ /* #undef HAVE_GENERIC_IO_ACCT_4ARG */ /* GENHD_FL_EXT_DEVT flag is available */ /* #undef HAVE_GENHD_FL_EXT_DEVT */ /* GENHD_FL_NO_PART flag is available */ /* #undef HAVE_GENHD_FL_NO_PART */ /* Define if the GNU gettext() function is already present or preinstalled. */ /* #undef HAVE_GETTEXT */ /* Define to 1 if you have the 'gettid' function. */ /* #undef HAVE_GETTID */ /* iops->get_acl() exists */ /* #undef HAVE_GET_ACL */ /* iops->get_acl() takes rcu */ /* #undef HAVE_GET_ACL_RCU */ /* has iops->get_inode_acl() */ /* #undef HAVE_GET_INODE_ACL */ /* iattr->ia_vfsuid and iattr->ia_vfsgid exist */ /* #undef HAVE_IATTR_VFSID */ /* Define if you have the iconv() function and it works. */ #define HAVE_ICONV 1 /* iops->getattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_GETATTR */ /* iops->setattr() takes struct mnt_idmap* */ /* #undef HAVE_IDMAP_IOPS_SETATTR */ /* APIs for idmapped mount are present */ /* #undef HAVE_IDMAP_MNT_API */ /* mnt_idmap does not have user_namespace */ /* #undef HAVE_IDMAP_NO_USERNS */ /* Define if compiler supports -Wimplicit-fallthrough */ /* #undef HAVE_IMPLICIT_FALLTHROUGH */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_INFINITE_RECURSION */ /* inode_get_atime() exists in linux/fs.h */ /* #undef HAVE_INODE_GET_ATIME */ /* inode_get_ctime() exists in linux/fs.h */ /* #undef HAVE_INODE_GET_CTIME */ /* inode_get_mtime() exists in linux/fs.h */ /* #undef HAVE_INODE_GET_MTIME */ /* inode_owner_or_capable() exists */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE */ /* inode_owner_or_capable() takes mnt_idmap */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_IDMAP */ /* inode_owner_or_capable() takes user_ns */ /* #undef HAVE_INODE_OWNER_OR_CAPABLE_USERNS */ /* inode_set_atime_to_ts() exists in linux/fs.h */ /* #undef HAVE_INODE_SET_ATIME_TO_TS */ /* inode_set_ctime_to_ts() exists in linux/fs.h */ /* #undef HAVE_INODE_SET_CTIME_TO_TS */ /* inode_set_mtime_to_ts() exists in linux/fs.h */ /* #undef HAVE_INODE_SET_MTIME_TO_TS */ /* timestamp_truncate() exists */ /* #undef HAVE_INODE_TIMESTAMP_TRUNCATE */ /* Define to 1 if you have the header file. 
*/ #define HAVE_INTTYPES_H 1 /* iops->create() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_CREATE_IDMAP */ /* iops->create() takes struct user_namespace* */ /* #undef HAVE_IOPS_CREATE_USERNS */ /* iops->mkdir() returns struct dentry* */ /* #undef HAVE_IOPS_MKDIR_DENTRY */ /* iops->mkdir() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKDIR_IDMAP */ /* iops->mkdir() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKDIR_USERNS */ /* iops->mknod() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_MKNOD_IDMAP */ /* iops->mknod() takes struct user_namespace* */ /* #undef HAVE_IOPS_MKNOD_USERNS */ /* iops->permission() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_PERMISSION_IDMAP */ /* iops->permission() takes struct user_namespace* */ /* #undef HAVE_IOPS_PERMISSION_USERNS */ /* iops->rename() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_RENAME_IDMAP */ /* iops->rename() takes struct user_namespace* */ /* #undef HAVE_IOPS_RENAME_USERNS */ /* iops->symlink() takes struct mnt_idmap* */ /* #undef HAVE_IOPS_SYMLINK_IDMAP */ /* iops->symlink() takes struct user_namespace* */ /* #undef HAVE_IOPS_SYMLINK_USERNS */ /* iov_iter_get_pages2() is available */ /* #undef HAVE_IOV_ITER_GET_PAGES2 */ /* iov_iter_type() is available */ /* #undef HAVE_IOV_ITER_TYPE */ /* Define to 1 if you have the 'issetugid' function. */ #define HAVE_ISSETUGID 1 /* iter_iov() is available */ /* #undef HAVE_ITER_IOV */ /* iter_is_ubuf() is available */ /* #undef HAVE_ITER_IS_UBUF */ /* kernel has kernel_fpu_* functions */ /* #undef HAVE_KERNEL_FPU */ /* kernel has asm/fpu/api.h */ /* #undef HAVE_KERNEL_FPU_API_HEADER */ /* kernel fpu internal */ /* #undef HAVE_KERNEL_FPU_INTERNAL */ /* kernel has asm/fpu/internal.h */ /* #undef HAVE_KERNEL_FPU_INTERNAL_HEADER */ /* Define if compiler supports -Winfinite-recursion */ /* #undef HAVE_KERNEL_INFINITE_RECURSION */ /* kernel defines intptr_t */ /* #undef HAVE_KERNEL_INTPTR_T */ /* kernel has kernel_neon_* functions */ /* #undef HAVE_KERNEL_NEON */ /* kernel does stack verification */ /* #undef HAVE_KERNEL_OBJTOOL */ /* kernel has linux/objtool.h */ /* #undef HAVE_KERNEL_OBJTOOL_HEADER */ /* strlcpy() exists */ /* #undef HAVE_KERNEL_STRLCPY */ /* kernel has kmap_local_page */ /* #undef HAVE_KMAP_LOCAL_PAGE */ /* Define if you have [aio] */ /* #undef HAVE_LIBAIO */ /* Define if you have [blkid] */ /* #undef HAVE_LIBBLKID */ /* Define if you have [crypto] */ #define HAVE_LIBCRYPTO 1 /* Define if you have [tirpc] */ /* #undef HAVE_LIBTIRPC */ /* Define if you have [udev] */ /* #undef HAVE_LIBUDEV */ /* Define if you have [unwind] */ /* #undef HAVE_LIBUNWIND */ /* libunwind has unw_get_elf_filename */ /* #undef HAVE_LIBUNWIND_ELF */ /* Define if you have [uuid] */ /* #undef HAVE_LIBUUID */ /* building against unsupported kernel version */ /* #undef HAVE_LINUX_EXPERIMENTAL */ -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LINUX_STAT_H */ - /* makedev() is declared in sys/mkdev.h */ /* #undef HAVE_MAKEDEV_IN_MKDEV */ /* makedev() is declared in sys/sysmacros.h */ /* #undef HAVE_MAKEDEV_IN_SYSMACROS */ /* Noting that make_request_fn() returns blk_qc_t */ /* #undef HAVE_MAKE_REQUEST_FN_RET_QC */ /* Define to 1 if you have the 'mlockall' function. 
*/ #define HAVE_MLOCKALL 1 /* PG_error flag is available */ /* #undef HAVE_MM_PAGE_FLAG_ERROR */ /* page_mapping() is available */ /* #undef HAVE_MM_PAGE_MAPPING */ /* page_size() is available */ /* #undef HAVE_MM_PAGE_SIZE */ /* Define if host toolchain supports MOVBE */ #define HAVE_MOVBE 1 /* folio_wait_bit() exists */ /* #undef HAVE_PAGEMAP_FOLIO_WAIT_BIT */ /* readahead_page() exists */ /* #undef HAVE_PAGEMAP_READAHEAD_PAGE */ /* part_to_dev() exists */ /* #undef HAVE_PART_TO_DEV */ /* iops->getattr() takes a path */ /* #undef HAVE_PATH_IOPS_GETATTR */ /* Define if host toolchain supports PCLMULQDQ */ #define HAVE_PCLMULQDQ 1 /* pin_user_pages_unlocked() is available */ /* #undef HAVE_PIN_USER_PAGES_UNLOCKED */ /* proc_handler ctl_table arg is const */ /* #undef HAVE_PROC_HANDLER_CTL_TABLE_CONST */ /* proc_ops structure exists */ /* #undef HAVE_PROC_OPS_STRUCT */ /* If available, contains the Python version number currently in use. */ /* #undef HAVE_PYTHON */ /* qat is enabled and existed */ /* #undef HAVE_QAT */ /* struct reclaim_state has reclaimed */ /* #undef HAVE_RECLAIM_STATE_RECLAIMED */ /* register_shrinker is vararg */ /* #undef HAVE_REGISTER_SHRINKER_VARARG */ /* register_sysctl_sz exists */ /* #undef HAVE_REGISTER_SYSCTL_SZ */ /* register_sysctl_table exists */ /* #undef HAVE_REGISTER_SYSCTL_TABLE */ /* iops->rename() wants flags */ /* #undef HAVE_RENAME_WANTS_FLAGS */ /* revalidate_disk() is available */ /* #undef HAVE_REVALIDATE_DISK */ /* revalidate_disk_size() is available */ /* #undef HAVE_REVALIDATE_DISK_SIZE */ /* Define to 1 if you have the header file. */ #define HAVE_SECURITY_PAM_MODULES_H 1 /* setattr_prepare() accepts mnt_idmap */ /* #undef HAVE_SETATTR_PREPARE_IDMAP */ /* setattr_prepare() is available, doesn't accept user_namespace */ /* #undef HAVE_SETATTR_PREPARE_NO_USERNS */ /* setattr_prepare() accepts user_namespace */ /* #undef HAVE_SETATTR_PREPARE_USERNS */ /* iops->set_acl() takes 4 args, arg1 is struct mnt_idmap * */ /* #undef HAVE_SET_ACL_IDMAP_DENTRY */ /* iops->set_acl() takes 4 args */ /* #undef HAVE_SET_ACL_USERNS */ /* iops->set_acl() takes 4 args, arg2 is struct dentry * */ /* #undef HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ /* shrinker_register exists */ /* #undef HAVE_SHRINKER_REGISTER */ /* kernel_siginfo_t exists */ /* #undef HAVE_SIGINFO */ /* sops->free_inode() exists */ /* #undef HAVE_SOPS_FREE_INODE */ #if defined(__amd64__) || defined(__i386__) /* Define if host toolchain supports SSE */ #define HAVE_SSE 1 /* Define if host toolchain supports SSE2 */ #define HAVE_SSE2 1 /* Define if host toolchain supports SSE3 */ #define HAVE_SSE3 1 /* Define if host toolchain supports SSE4.1 */ #define HAVE_SSE4_1 1 /* Define if host toolchain supports SSE4.2 */ #define HAVE_SSE4_2 1 /* Define if host toolchain supports SSSE3 */ #define HAVE_SSSE3 1 #endif /* STACK_FRAME_NON_STANDARD is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD */ /* STACK_FRAME_NON_STANDARD asm macro is defined */ /* #undef HAVE_STACK_FRAME_NON_STANDARD_ASM */ /* standalone exists */ /* #undef HAVE_STANDALONE_LINUX_STDARG */ /* statx() is available */ /* #undef HAVE_STATX */ /* STATX_MNT_ID is available */ /* #undef HAVE_STATX_MNT_ID */ /* Define to 1 if you have the header file. */ #define HAVE_STDINT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDIO_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STDLIB_H 1 /* Define to 1 if you have the header file. */ #define HAVE_STRINGS_H 1 /* Define to 1 if you have the header file. 
*/ #define HAVE_STRING_H 1 /* Define to 1 if you have the 'strlcat' function. */ #define HAVE_STRLCAT 1 /* Define to 1 if you have the 'strlcpy' function. */ #define HAVE_STRLCPY 1 /* submit_bio is member of struct block_device_operations */ /* #undef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ /* have super_block s_shrink */ /* #undef HAVE_SUPER_BLOCK_S_SHRINK */ /* have super_block s_shrink pointer */ /* #undef HAVE_SUPER_BLOCK_S_SHRINK_PTR */ /* have super_block s_wb_err */ /* #undef HAVE_SUPER_BLOCK_S_WB_ERR */ /* sync_blockdev() is declared in include/blkdev.h */ /* #undef HAVE_SYNC_BLOCKDEV */ /* struct kobj_type has default_groups */ /* #undef HAVE_SYSFS_DEFAULT_GROUPS */ /* Define to 1 if you have the header file. */ #define HAVE_SYS_STAT_H 1 /* Define to 1 if you have the header file. */ #define HAVE_SYS_TYPES_H 1 /* timer_delete_sync is available */ /* #undef HAVE_TIMER_DELETE_SYNC */ /* i_op->tmpfile() uses old dentry signature */ /* #undef HAVE_TMPFILE_DENTRY */ /* i_op->tmpfile() has mnt_idmap */ /* #undef HAVE_TMPFILE_IDMAP */ /* i_op->tmpfile() has userns */ /* #undef HAVE_TMPFILE_USERNS */ /* totalhigh_pages() exists */ /* #undef HAVE_TOTALHIGH_PAGES */ /* kernel has totalram_pages() */ /* #undef HAVE_TOTALRAM_PAGES_FUNC */ /* Define to 1 if you have the 'udev_device_get_is_initialized' function. */ /* #undef HAVE_UDEV_DEVICE_GET_IS_INITIALIZED */ /* kernel has __kernel_fpu_* functions */ /* #undef HAVE_UNDERSCORE_KERNEL_FPU */ /* Define to 1 if you have the header file. */ #define HAVE_UNISTD_H 1 /* iops->getattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_GETATTR */ /* iops->setattr() takes struct user_namespace* */ /* #undef HAVE_USERNS_IOPS_SETATTR */ #ifdef __amd64__ /* Define if host toolchain supports VAES */ #define HAVE_VAES 1 #endif /* fops->clone_file_range() is available */ /* #undef HAVE_VFS_CLONE_FILE_RANGE */ /* fops->dedupe_file_range() is available */ /* #undef HAVE_VFS_DEDUPE_FILE_RANGE */ /* filemap_dirty_folio exists */ /* #undef HAVE_VFS_FILEMAP_DIRTY_FOLIO */ /* generic_copy_file_range() is available */ /* #undef HAVE_VFS_GENERIC_COPY_FILE_RANGE */ /* migratepage exists */ /* #undef HAVE_VFS_MIGRATEPAGE */ /* migrate_folio exists */ /* #undef HAVE_VFS_MIGRATE_FOLIO */ /* address_space_operations->readpages exists */ /* #undef HAVE_VFS_READPAGES */ /* read_folio exists */ /* #undef HAVE_VFS_READ_FOLIO */ /* fops->remap_file_range() is available */ /* #undef HAVE_VFS_REMAP_FILE_RANGE */ /* __set_page_dirty_nobuffers exists */ /* #undef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS */ /* splice_copy_file_range() is available */ /* #undef HAVE_VFS_SPLICE_COPY_FILE_RANGE */ /* address_space_operations->writepage exists */ /* #undef HAVE_VFS_WRITEPAGE */ /* __vmalloc page flags exists */ /* #undef HAVE_VMALLOC_PAGE_KERNEL */ #ifdef __amd64__ /* Define if host toolchain supports VPCLMULQDQ */ #define HAVE_VPCLMULQDQ 1 #endif /* int (*writepage_t)() takes struct folio* */ /* #undef HAVE_WRITEPAGE_T_FOLIO */ /* xattr_handler->get() wants dentry and inode and flags */ /* #undef HAVE_XATTR_GET_DENTRY_INODE_FLAGS */ /* xattr_handler->set() wants both dentry and inode */ /* #undef HAVE_XATTR_SET_DENTRY_INODE */ /* xattr_handler->set() takes mnt_idmap */ /* #undef HAVE_XATTR_SET_IDMAP */ /* xattr_handler->set() takes user_namespace */ /* #undef HAVE_XATTR_SET_USERNS */ /* Define if host toolchain supports XSAVE */ #define HAVE_XSAVE 1 /* Define if host toolchain supports XSAVEOPT */ #define HAVE_XSAVEOPT 1 /* Define if host toolchain supports XSAVES */ 
#define HAVE_XSAVES 1 /* ZERO_PAGE() is GPL-only */ /* #undef HAVE_ZERO_PAGE_GPL_ONLY */ /* Define if you have [z] */ #define HAVE_ZLIB 1 /* libunwind is llvm libunwind */ /* #undef IS_LIBUNWIND_LLVM */ /* kernel exports FPU functions */ /* #undef KERNEL_EXPORTS_X86_FPU */ /* TBD: fetch(3) support */ #if 0 /* whether the chosen libfetch is to be loaded at run-time */ #define LIBFETCH_DYNAMIC 1 /* libfetch is fetch(3) */ #define LIBFETCH_IS_FETCH 1 /* libfetch is libcurl */ #define LIBFETCH_IS_LIBCURL 0 /* soname of chosen libfetch */ #define LIBFETCH_SONAME "libfetch.so.6" #endif /* Define to the sub-directory where libtool stores uninstalled libraries. */ #define LT_OBJDIR ".libs/" /* make_request_fn() return type */ /* #undef MAKE_REQUEST_FN_RET */ /* using complete_and_exit() instead */ /* #undef SPL_KTHREAD_COMPLETE_AND_EXIT */ /* Defined for legacy compatibility. */ #define SPL_META_ALIAS ZFS_META_ALIAS /* Defined for legacy compatibility. */ #define SPL_META_RELEASE ZFS_META_RELEASE /* Defined for legacy compatibility. */ #define SPL_META_VERSION ZFS_META_VERSION /* pde_data() is PDE_DATA() */ /* #undef SPL_PDE_DATA */ /* Define to 1 if all of the C89 standard headers exist (not just the ones required in a freestanding environment). This macro is provided for backward compatibility; new code need not use it. */ #define SYSTEM_FREEBSD 1 /* True if ZFS is to be compiled for a Linux system */ /* #undef SYSTEM_LINUX */ /* Version number of package */ /* #undef ZFS_DEBUG */ /* /dev/zfs minor */ /* #undef ZFS_DEVICE_MINOR */ /* Define the project alias string. */ -#define ZFS_META_ALIAS "zfs-2.4.0-rc1-FreeBSD_g00dfa094a" +#define ZFS_META_ALIAS "zfs-2.4.99-29-FreeBSD_g7939bad5e" /* Define the project author. */ #define ZFS_META_AUTHOR "OpenZFS" /* Define the project release date. */ /* #undef ZFS_META_DATA */ /* Define the maximum compatible kernel version. */ #define ZFS_META_KVER_MAX "6.16" /* Define the minimum compatible kernel version. */ #define ZFS_META_KVER_MIN "4.18" /* Define the project license. */ #define ZFS_META_LICENSE "CDDL" /* Define the libtool library 'age' version information. */ /* #undef ZFS_META_LT_AGE */ /* Define the libtool library 'current' version information. */ /* #undef ZFS_META_LT_CURRENT */ /* Define the libtool library 'revision' version information. */ /* #undef ZFS_META_LT_REVISION */ /* Define the project name. */ #define ZFS_META_NAME "zfs" /* Define the project release. */ -#define ZFS_META_RELEASE "zfs-2.4.0-rc1-FreeBSD_g00dfa094a" +#define ZFS_META_RELEASE "29-FreeBSD_g7939bad5e" /* Define the project version. */ -#define ZFS_META_VERSION "2.4.0" +#define ZFS_META_VERSION "2.4.99" /* count is located in percpu_ref.data */ /* #undef ZFS_PERCPU_REF_COUNT_IN_DATA */ diff --git a/sys/modules/zfs/zfs_gitrev.h b/sys/modules/zfs/zfs_gitrev.h index 2b5d717da216..fff89435a0ff 100644 --- a/sys/modules/zfs/zfs_gitrev.h +++ b/sys/modules/zfs/zfs_gitrev.h @@ -1 +1 @@ -#define ZFS_META_GITREV "zfs-2.4.0-rc1-0-g00dfa094a" +#define ZFS_META_GITREV "zfs-2.4.99-29-g7939bad5e"
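
The fault_limits.ksh change earlier in this diff keeps the test's core idea intact: after trying to fault parity+1 children of a raidz vdev while a replacing vdev is active, it counts how many children actually report FAULTED and asserts the count stays below the parity level. Stripped of the test harness, that counting step is just a loop over the per-vdev "state" property; the pool name, disk paths and PARITY value below are hypothetical placeholders, not taken from the test.

    # count FAULTED leaf vdevs and check the limit (illustrative sketch)
    PARITY=2
    faults=0
    for disk in /var/tmp/dev-0 /var/tmp/dev-1 /var/tmp/dev-2; do
            state=$(zpool get -H -o value state tank "$disk")
            [ "$state" = "FAULTED" ] && ((faults = faults + 1))
    done
    [ "$faults" -gt 0 ] && [ "$faults" -lt "$PARITY" ] && \
            echo "fault count restricted as expected"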
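
The new mount_loopback.ksh test runs its loop-device steps through the test harness (log_must, the fio scripts). A minimal standalone sketch of the same pattern, outside the harness, might look like the following; the pool name, paths and sizes are illustrative assumptions, and it presumes losetup, mkfs.xfs and XFS kernel support are available, which the test itself checks before running.

    # file-backed pool, file on a dataset, loop device, xfs on top (sketch only)
    truncate -s 1G /tmp/loopdemo-backing
    zpool create loopdemo /tmp/loopdemo-backing
    truncate -s 512M /loopdemo/img
    dev=$(losetup -f --show /loopdemo/img)      # attach and print e.g. /dev/loop0
    mkfs.xfs "$dev"
    mkdir -p /tmp/loopdemo-mnt
    mount "$dev" /tmp/loopdemo-mnt
    # ... run I/O against /tmp/loopdemo-mnt (the test uses fio mkfiles/random_reads) ...
    umount /tmp/loopdemo-mnt
    losetup -d "$dev"
    zpool destroy loopdemo
    rm /tmp/loopdemo-backing

The test repeats the same sequence twice, once with a plain file on a dataset and once with a zvol block device, since the original issue only reproduced with certain backing configurations.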